[llvm] 8a70102 - [ARM] Lower i1 concat via MVETRUNC
David Green via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 18 11:40:16 PDT 2023
Author: David Green
Date: 2023-10-18T19:40:11+01:00
New Revision: 8a701024f3e093c5f1cf6dd022f57baff0551a49
URL: https://github.com/llvm/llvm-project/commit/8a701024f3e093c5f1cf6dd022f57baff0551a49
DIFF: https://github.com/llvm/llvm-project/commit/8a701024f3e093c5f1cf6dd022f57baff0551a49.diff
LOG: [ARM] Lower i1 concat via MVETRUNC
The MVETRUNC operation can perform the same truncate of two vectors without
requiring inserts/extracts for every vector lane. This moves the i1 concat
lowering to use it for v8i1 and v16i1 result types, trading a bit of extra
stack space for fewer instructions.
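As an illustration, the concat_v4i1 test in
llvm/test/CodeGen/Thumb2/mve-concat.ll (updated below) exercises exactly this
pattern. A reduced sketch of it follows; the icmp lines match the test as
shown in the diff, while the shufflevector/select value names (%bi, %cc, %r)
are illustrative:

  define arm_aapcs_vfpcc <8 x i16> @concat_v4i1(<4 x i32> %a, <4 x i32> %b, <8 x i16> %c) {
  entry:
    %ai = icmp slt <4 x i32> %a, zeroinitializer
    %bi = icmp slt <4 x i32> %b, zeroinitializer
    %cc = shufflevector <4 x i1> %ai, <4 x i1> %bi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    %r = select <8 x i1> %cc, <8 x i16> %c, <8 x i16> zeroinitializer
    ret <8 x i16> %r
  }

The two v4i1 halves are promoted to v4i32, joined through ARMISD::MVETRUNC,
and turned back into a predicate with VCMPZ. In the updated CHECK lines this
shows up as a pair of narrowing vstrh.32 stores to a stack slot followed by a
single full-width vldrw.u32 reload, replacing the previous per-lane
vmov/vmov.16 sequences.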
Added:
Modified:
llvm/lib/Target/ARM/ARMISelLowering.cpp
llvm/test/CodeGen/ARM/fadd-select-fneg-combine.ll
llvm/test/CodeGen/Thumb2/active_lane_mask.ll
llvm/test/CodeGen/Thumb2/mve-concat.ll
llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll
llvm/test/CodeGen/Thumb2/mve-phireg.ll
llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll
llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 6e58cbaf2ac4a84..9fe5dd55a810c4b 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -9095,13 +9095,21 @@ static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
unsigned NumElts = 2 * Op1VT.getVectorNumElements();
+ EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
+ if (Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) {
+ // Use MVETRUNC to truncate the combined NewV1::NewV2 into the smaller
+ // ConcatVT.
+ SDValue ConVec =
+ DAG.getNode(ARMISD::MVETRUNC, dl, ConcatVT, NewV1, NewV2);
+ return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
+ DAG.getConstant(ARMCC::NE, dl, MVT::i32));
+ }
+
// Extract the vector elements from Op1 and Op2 one by one and truncate them
// to be the right size for the destination. For example, if Op1 is v4i1
// then the promoted vector is v4i32. The result of concatenation gives a
// v8i1, which when promoted is v8i16. That means each i32 element from Op1
// needs truncating to i16 and inserting in the result.
- EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
- SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
EVT NewVT = NewV.getValueType();
EVT ConcatVT = ConVec.getValueType();
@@ -9119,6 +9127,7 @@ static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
return ConVec;
};
unsigned j = 0;
+ SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
ConVec = ExtractInto(NewV1, ConVec, j);
ConVec = ExtractInto(NewV2, ConVec, j);
diff --git a/llvm/test/CodeGen/ARM/fadd-select-fneg-combine.ll b/llvm/test/CodeGen/ARM/fadd-select-fneg-combine.ll
index 656bce616ea0486..3f2b40460917e48 100644
--- a/llvm/test/CodeGen/ARM/fadd-select-fneg-combine.ll
+++ b/llvm/test/CodeGen/ARM/fadd-select-fneg-combine.ll
@@ -284,40 +284,33 @@ define half @fadd_select_fneg_posk_f16(i32 %arg0, half %x, half %y) {
define <8 x half> @fadd_vselect_fneg_posk_v8f16(<8 x i32> %arg0, <8 x half> %x, <8 x half> %y) {
; CHECK-LABEL: fadd_vselect_fneg_posk_v8f16:
; CHECK: @ %bb.0:
-; CHECK-NEXT: push {r4, r5, r6, lr}
+; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: vmov d0, r0, r1
-; CHECK-NEXT: vmov.i8 q1, #0xff
-; CHECK-NEXT: vmov d1, r2, r3
; CHECK-NEXT: add r0, sp, #16
+; CHECK-NEXT: vmov d1, r2, r3
+; CHECK-NEXT: vldrw.u32 q3, [r0]
; CHECK-NEXT: vcmp.i32 eq, q0, zr
; CHECK-NEXT: vmov.i8 q0, #0x0
+; CHECK-NEXT: vmov.i8 q1, #0xff
+; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vpsel q2, q1, q0
-; CHECK-NEXT: vldrw.u32 q3, [r0]
-; CHECK-NEXT: vmov r2, r1, d4
-; CHECK-NEXT: add r12, sp, #32
-; CHECK-NEXT: vmov r4, r5, d5
-; CHECK-NEXT: vmov.16 q2[0], r2
-; CHECK-NEXT: vmov.16 q2[1], r1
; CHECK-NEXT: vcmp.i32 eq, q3, zr
-; CHECK-NEXT: vpsel q1, q1, q0
-; CHECK-NEXT: vmov.16 q2[2], r4
-; CHECK-NEXT: vmov r3, r0, d2
-; CHECK-NEXT: vmov.16 q2[3], r5
-; CHECK-NEXT: vmov.16 q2[4], r3
-; CHECK-NEXT: vmov r6, lr, d3
-; CHECK-NEXT: vmov.16 q2[5], r0
-; CHECK-NEXT: vldrw.u32 q1, [r12]
-; CHECK-NEXT: vmov.16 q2[6], r6
-; CHECK-NEXT: vmov.i16 q0, #0xc400
-; CHECK-NEXT: vmov.16 q2[7], lr
+; CHECK-NEXT: vpsel q0, q1, q0
+; CHECK-NEXT: vstrh.32 q2, [r0]
+; CHECK-NEXT: vstrh.32 q0, [r0, #8]
+; CHECK-NEXT: add r1, sp, #32
+; CHECK-NEXT: vldrw.u32 q2, [r0]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vmov.i16 q1, #0xc400
; CHECK-NEXT: add r0, sp, #48
; CHECK-NEXT: vcmp.i16 ne, q2, zr
-; CHECK-NEXT: vpsel q0, q1, q0
+; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vsub.f16 q0, q1, q0
; CHECK-NEXT: vmov r0, r1, d0
; CHECK-NEXT: vmov r2, r3, d1
-; CHECK-NEXT: pop {r4, r5, r6, pc}
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: bx lr
%cmp = icmp eq <8 x i32> %arg0, zeroinitializer
%neg.x = fneg <8 x half> %x
%select = select <8 x i1> %cmp, <8 x half> %neg.x, <8 x half> <half 4.0, half 4.0, half 4.0, half 4.0, half 4.0, half 4.0, half 4.0, half 4.0>
diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
index 3ce79225cd5e612..bcd92f81911b26d 100644
--- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
+++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
@@ -146,54 +146,47 @@ define <7 x i32> @v7i32(i32 %index, i32 %TC, <7 x i32> %V1, <7 x i32> %V2) {
define <8 x i16> @v8i16(i32 %index, i32 %TC, <8 x i16> %V1, <8 x i16> %V2) {
; CHECK-LABEL: v8i16:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: adr.w r12, .LCPI3_0
; CHECK-NEXT: vdup.32 q1, r1
; CHECK-NEXT: vldrw.u32 q0, [r12]
-; CHECK-NEXT: vmov.i8 q2, #0x0
-; CHECK-NEXT: vmov.i8 q3, #0xff
+; CHECK-NEXT: vmov.i8 q2, #0xff
+; CHECK-NEXT: mov r4, sp
+; CHECK-NEXT: adr r1, .LCPI3_1
; CHECK-NEXT: vqadd.u32 q0, q0, r0
; CHECK-NEXT: vcmp.u32 hi, q1, q0
-; CHECK-NEXT: vpsel q4, q3, q2
-; CHECK-NEXT: vmov r1, r12, d8
-; CHECK-NEXT: vmov.16 q0[0], r1
-; CHECK-NEXT: vmov.16 q0[1], r12
-; CHECK-NEXT: vmov r1, r12, d9
-; CHECK-NEXT: vmov.16 q0[2], r1
-; CHECK-NEXT: adr r1, .LCPI3_1
-; CHECK-NEXT: vldrw.u32 q4, [r1]
-; CHECK-NEXT: vmov.16 q0[3], r12
-; CHECK-NEXT: vqadd.u32 q4, q4, r0
-; CHECK-NEXT: vcmp.u32 hi, q1, q4
-; CHECK-NEXT: vpsel q1, q3, q2
-; CHECK-NEXT: vmov r0, r1, d2
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov.16 q0[5], r1
-; CHECK-NEXT: vmov r0, r1, d3
-; CHECK-NEXT: vmov.16 q0[6], r0
-; CHECK-NEXT: add r0, sp, #24
-; CHECK-NEXT: vmov.16 q0[7], r1
+; CHECK-NEXT: vmov.i8 q0, #0x0
+; CHECK-NEXT: vpsel q3, q2, q0
+; CHECK-NEXT: vstrh.32 q3, [r4, #8]
+; CHECK-NEXT: vldrw.u32 q3, [r1]
+; CHECK-NEXT: vqadd.u32 q3, q3, r0
+; CHECK-NEXT: add r0, sp, #32
+; CHECK-NEXT: vcmp.u32 hi, q1, q3
; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vcmp.i16 ne, q0, zr
-; CHECK-NEXT: vldr d1, [sp, #16]
+; CHECK-NEXT: vpsel q0, q2, q0
+; CHECK-NEXT: vstrh.32 q0, [r4]
+; CHECK-NEXT: vldr d1, [sp, #24]
+; CHECK-NEXT: vldrw.u32 q2, [r4]
; CHECK-NEXT: vmov d0, r2, r3
+; CHECK-NEXT: vcmp.i16 ne, q2, zr
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmov r0, r1, d0
; CHECK-NEXT: vmov r2, r3, d1
-; CHECK-NEXT: vpop {d8, d9}
-; CHECK-NEXT: bx lr
+; CHECK-NEXT: add sp, #16
+; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI3_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
-; CHECK-NEXT: .LCPI3_1:
; CHECK-NEXT: .long 4 @ 0x4
; CHECK-NEXT: .long 5 @ 0x5
; CHECK-NEXT: .long 6 @ 0x6
; CHECK-NEXT: .long 7 @ 0x7
+; CHECK-NEXT: .LCPI3_1:
+; CHECK-NEXT: .long 0 @ 0x0
+; CHECK-NEXT: .long 1 @ 0x1
+; CHECK-NEXT: .long 2 @ 0x2
+; CHECK-NEXT: .long 3 @ 0x3
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %TC)
%select = select <8 x i1> %active.lane.mask, <8 x i16> %V1, <8 x i16> %V2
ret <8 x i16> %select
@@ -202,122 +195,79 @@ define <8 x i16> @v8i16(i32 %index, i32 %TC, <8 x i16> %V1, <8 x i16> %V2) {
define <16 x i8> @v16i8(i32 %index, i32 %TC, <16 x i8> %V1, <16 x i8> %V2) {
; CHECK-LABEL: v16i8:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: sub sp, #48
; CHECK-NEXT: adr.w r12, .LCPI4_0
-; CHECK-NEXT: vdup.32 q3, r1
+; CHECK-NEXT: vdup.32 q2, r1
; CHECK-NEXT: vldrw.u32 q0, [r12]
; CHECK-NEXT: vmov.i8 q1, #0xff
+; CHECK-NEXT: add r5, sp, #16
+; CHECK-NEXT: adr r1, .LCPI4_1
; CHECK-NEXT: vqadd.u32 q0, q0, r0
-; CHECK-NEXT: vcmp.u32 hi, q3, q0
+; CHECK-NEXT: adr r4, .LCPI4_3
+; CHECK-NEXT: vcmp.u32 hi, q2, q0
; CHECK-NEXT: vmov.i8 q0, #0x0
-; CHECK-NEXT: vpsel q4, q1, q0
-; CHECK-NEXT: vmov r1, r12, d8
-; CHECK-NEXT: vmov.16 q2[0], r1
-; CHECK-NEXT: vmov.16 q2[1], r12
-; CHECK-NEXT: vmov r1, r12, d9
-; CHECK-NEXT: vmov.16 q2[2], r1
-; CHECK-NEXT: adr r1, .LCPI4_1
-; CHECK-NEXT: vldrw.u32 q4, [r1]
-; CHECK-NEXT: vmov.16 q2[3], r12
-; CHECK-NEXT: vqadd.u32 q4, q4, r0
-; CHECK-NEXT: vcmp.u32 hi, q3, q4
-; CHECK-NEXT: vpsel q4, q1, q0
-; CHECK-NEXT: vmov r1, r12, d8
-; CHECK-NEXT: vmov.16 q2[4], r1
-; CHECK-NEXT: vmov.16 q2[5], r12
-; CHECK-NEXT: vmov r1, r12, d9
-; CHECK-NEXT: vmov.16 q2[6], r1
-; CHECK-NEXT: vmov.16 q2[7], r12
-; CHECK-NEXT: vcmp.i16 ne, q2, zr
-; CHECK-NEXT: vpsel q4, q1, q0
-; CHECK-NEXT: vmov.u16 r1, q4[0]
-; CHECK-NEXT: vmov.8 q2[0], r1
-; CHECK-NEXT: vmov.u16 r1, q4[1]
-; CHECK-NEXT: vmov.8 q2[1], r1
-; CHECK-NEXT: vmov.u16 r1, q4[2]
-; CHECK-NEXT: vmov.8 q2[2], r1
-; CHECK-NEXT: vmov.u16 r1, q4[3]
-; CHECK-NEXT: vmov.8 q2[3], r1
-; CHECK-NEXT: vmov.u16 r1, q4[4]
-; CHECK-NEXT: vmov.8 q2[4], r1
-; CHECK-NEXT: vmov.u16 r1, q4[5]
-; CHECK-NEXT: vmov.8 q2[5], r1
-; CHECK-NEXT: vmov.u16 r1, q4[6]
-; CHECK-NEXT: vmov.8 q2[6], r1
-; CHECK-NEXT: vmov.u16 r1, q4[7]
-; CHECK-NEXT: vmov.8 q2[7], r1
+; CHECK-NEXT: vpsel q3, q1, q0
+; CHECK-NEXT: vstrh.32 q3, [r5, #8]
+; CHECK-NEXT: vldrw.u32 q3, [r1]
; CHECK-NEXT: adr r1, .LCPI4_2
-; CHECK-NEXT: vldrw.u32 q4, [r1]
-; CHECK-NEXT: vqadd.u32 q4, q4, r0
-; CHECK-NEXT: vcmp.u32 hi, q3, q4
-; CHECK-NEXT: vpsel q5, q1, q0
-; CHECK-NEXT: vmov r1, r12, d10
-; CHECK-NEXT: vmov.16 q4[0], r1
-; CHECK-NEXT: vmov.16 q4[1], r12
-; CHECK-NEXT: vmov r1, r12, d11
-; CHECK-NEXT: vmov.16 q4[2], r1
-; CHECK-NEXT: adr r1, .LCPI4_3
-; CHECK-NEXT: vldrw.u32 q5, [r1]
-; CHECK-NEXT: vmov.16 q4[3], r12
-; CHECK-NEXT: vqadd.u32 q5, q5, r0
-; CHECK-NEXT: vcmp.u32 hi, q3, q5
+; CHECK-NEXT: vqadd.u32 q3, q3, r0
+; CHECK-NEXT: vcmp.u32 hi, q2, q3
+; CHECK-NEXT: vpsel q3, q1, q0
+; CHECK-NEXT: vstrh.32 q3, [r5]
+; CHECK-NEXT: vldrw.u32 q3, [r1]
+; CHECK-NEXT: mov r1, sp
+; CHECK-NEXT: vqadd.u32 q3, q3, r0
+; CHECK-NEXT: vcmp.u32 hi, q2, q3
; CHECK-NEXT: vpsel q3, q1, q0
-; CHECK-NEXT: vmov r0, r1, d6
-; CHECK-NEXT: vmov.16 q4[4], r0
-; CHECK-NEXT: vmov.16 q4[5], r1
-; CHECK-NEXT: vmov r0, r1, d7
-; CHECK-NEXT: vmov.16 q4[6], r0
-; CHECK-NEXT: vmov.16 q4[7], r1
-; CHECK-NEXT: vcmp.i16 ne, q4, zr
+; CHECK-NEXT: vstrh.32 q3, [r1, #8]
+; CHECK-NEXT: vldrw.u32 q3, [r4]
+; CHECK-NEXT: vqadd.u32 q3, q3, r0
+; CHECK-NEXT: add r0, sp, #32
+; CHECK-NEXT: vcmp.u32 hi, q2, q3
+; CHECK-NEXT: vpsel q2, q1, q0
+; CHECK-NEXT: vstrh.32 q2, [r1]
+; CHECK-NEXT: vldrw.u32 q2, [r5]
+; CHECK-NEXT: vcmp.i16 ne, q2, zr
+; CHECK-NEXT: vpsel q2, q1, q0
+; CHECK-NEXT: vstrb.16 q2, [r0, #8]
+; CHECK-NEXT: vldrw.u32 q2, [r1]
+; CHECK-NEXT: add r1, sp, #72
+; CHECK-NEXT: vcmp.i16 ne, q2, zr
; CHECK-NEXT: vpsel q0, q1, q0
-; CHECK-NEXT: vmov.u16 r0, q0[0]
-; CHECK-NEXT: vmov.8 q2[8], r0
-; CHECK-NEXT: vmov.u16 r0, q0[1]
-; CHECK-NEXT: vmov.8 q2[9], r0
-; CHECK-NEXT: vmov.u16 r0, q0[2]
-; CHECK-NEXT: vmov.8 q2[10], r0
-; CHECK-NEXT: vmov.u16 r0, q0[3]
-; CHECK-NEXT: vmov.8 q2[11], r0
-; CHECK-NEXT: vmov.u16 r0, q0[4]
-; CHECK-NEXT: vmov.8 q2[12], r0
-; CHECK-NEXT: vmov.u16 r0, q0[5]
-; CHECK-NEXT: vmov.8 q2[13], r0
-; CHECK-NEXT: vmov.u16 r0, q0[6]
-; CHECK-NEXT: vmov.8 q2[14], r0
-; CHECK-NEXT: vmov.u16 r0, q0[7]
-; CHECK-NEXT: vmov.8 q2[15], r0
-; CHECK-NEXT: add r0, sp, #40
-; CHECK-NEXT: vldr d1, [sp, #32]
-; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vcmp.i8 ne, q2, zr
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vstrb.16 q0, [r0]
+; CHECK-NEXT: vldr d1, [sp, #64]
+; CHECK-NEXT: vldrw.u32 q2, [r0]
; CHECK-NEXT: vmov d0, r2, r3
+; CHECK-NEXT: vcmp.i8 ne, q2, zr
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmov r0, r1, d0
; CHECK-NEXT: vmov r2, r3, d1
-; CHECK-NEXT: vpop {d8, d9, d10, d11}
-; CHECK-NEXT: bx lr
+; CHECK-NEXT: add sp, #48
+; CHECK-NEXT: pop {r4, r5, r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI4_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
+; CHECK-NEXT: .long 12 @ 0xc
+; CHECK-NEXT: .long 13 @ 0xd
+; CHECK-NEXT: .long 14 @ 0xe
+; CHECK-NEXT: .long 15 @ 0xf
; CHECK-NEXT: .LCPI4_1:
-; CHECK-NEXT: .long 4 @ 0x4
-; CHECK-NEXT: .long 5 @ 0x5
-; CHECK-NEXT: .long 6 @ 0x6
-; CHECK-NEXT: .long 7 @ 0x7
-; CHECK-NEXT: .LCPI4_2:
; CHECK-NEXT: .long 8 @ 0x8
; CHECK-NEXT: .long 9 @ 0x9
; CHECK-NEXT: .long 10 @ 0xa
; CHECK-NEXT: .long 11 @ 0xb
+; CHECK-NEXT: .LCPI4_2:
+; CHECK-NEXT: .long 4 @ 0x4
+; CHECK-NEXT: .long 5 @ 0x5
+; CHECK-NEXT: .long 6 @ 0x6
+; CHECK-NEXT: .long 7 @ 0x7
; CHECK-NEXT: .LCPI4_3:
-; CHECK-NEXT: .long 12 @ 0xc
-; CHECK-NEXT: .long 13 @ 0xd
-; CHECK-NEXT: .long 14 @ 0xe
-; CHECK-NEXT: .long 15 @ 0xf
+; CHECK-NEXT: .long 0 @ 0x0
+; CHECK-NEXT: .long 1 @ 0x1
+; CHECK-NEXT: .long 2 @ 0x2
+; CHECK-NEXT: .long 3 @ 0x3
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %TC)
%select = select <16 x i1> %active.lane.mask, <16 x i8> %V1, <16 x i8> %V2
ret <16 x i8> %select
diff --git a/llvm/test/CodeGen/Thumb2/mve-concat.ll b/llvm/test/CodeGen/Thumb2/mve-concat.ll
index 0a3d9be820e6add..8fa8c6c56ee5fdd 100644
--- a/llvm/test/CodeGen/Thumb2/mve-concat.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-concat.ll
@@ -34,30 +34,25 @@ declare <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32)
define arm_aapcs_vfpcc <8 x i16> @concat_v4i1(<4 x i32> %a, <4 x i32> %b, <8 x i16> %c) {
; CHECK-LABEL: concat_v4i1:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11}
-; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: vmov.i8 q3, #0x0
; CHECK-NEXT: vmov.i8 q4, #0xff
-; CHECK-NEXT: vcmp.s32 lt, q0, zr
-; CHECK-NEXT: vpsel q5, q4, q3
; CHECK-NEXT: vcmp.s32 lt, q1, zr
-; CHECK-NEXT: vmov r0, r1, d10
+; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vpsel q1, q4, q3
-; CHECK-NEXT: vmov.16 q0[0], r0
-; CHECK-NEXT: vmov.16 q0[1], r1
-; CHECK-NEXT: vmov r0, r1, d11
-; CHECK-NEXT: vmov.16 q0[2], r0
-; CHECK-NEXT: vmov.16 q0[3], r1
-; CHECK-NEXT: vmov r0, r1, d2
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov.16 q0[5], r1
-; CHECK-NEXT: vmov r0, r1, d3
-; CHECK-NEXT: vmov.16 q0[6], r0
-; CHECK-NEXT: vmov.16 q0[7], r1
-; CHECK-NEXT: vcmp.i16 ne, q0, zr
+; CHECK-NEXT: vcmp.s32 lt, q0, zr
+; CHECK-NEXT: vpsel q0, q4, q3
+; CHECK-NEXT: vstrh.32 q1, [r0, #8]
+; CHECK-NEXT: vstrh.32 q0, [r0]
; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vcmp.i16 ne, q1, zr
; CHECK-NEXT: vpsel q0, q2, q0
-; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: add sp, #16
+; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
entry:
%ai = icmp slt <4 x i32> %a, zeroinitializer
@@ -70,50 +65,25 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @concat_v8i1(<8 x i16> %a, <8 x i16> %b, <16 x i8> %c) {
; CHECK-LABEL: concat_v8i1:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11}
-; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: vmov.i8 q3, #0x0
; CHECK-NEXT: vmov.i8 q4, #0xff
-; CHECK-NEXT: vcmp.s16 lt, q0, zr
-; CHECK-NEXT: vpsel q5, q4, q3
; CHECK-NEXT: vcmp.s16 lt, q1, zr
-; CHECK-NEXT: vmov.u16 r0, q5[0]
+; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vpsel q1, q4, q3
-; CHECK-NEXT: vmov.8 q0[0], r0
-; CHECK-NEXT: vmov.u16 r0, q5[1]
-; CHECK-NEXT: vmov.8 q0[1], r0
-; CHECK-NEXT: vmov.u16 r0, q5[2]
-; CHECK-NEXT: vmov.8 q0[2], r0
-; CHECK-NEXT: vmov.u16 r0, q5[3]
-; CHECK-NEXT: vmov.8 q0[3], r0
-; CHECK-NEXT: vmov.u16 r0, q5[4]
-; CHECK-NEXT: vmov.8 q0[4], r0
-; CHECK-NEXT: vmov.u16 r0, q5[5]
-; CHECK-NEXT: vmov.8 q0[5], r0
-; CHECK-NEXT: vmov.u16 r0, q5[6]
-; CHECK-NEXT: vmov.8 q0[6], r0
-; CHECK-NEXT: vmov.u16 r0, q5[7]
-; CHECK-NEXT: vmov.8 q0[7], r0
-; CHECK-NEXT: vmov.u16 r0, q1[0]
-; CHECK-NEXT: vmov.8 q0[8], r0
-; CHECK-NEXT: vmov.u16 r0, q1[1]
-; CHECK-NEXT: vmov.8 q0[9], r0
-; CHECK-NEXT: vmov.u16 r0, q1[2]
-; CHECK-NEXT: vmov.8 q0[10], r0
-; CHECK-NEXT: vmov.u16 r0, q1[3]
-; CHECK-NEXT: vmov.8 q0[11], r0
-; CHECK-NEXT: vmov.u16 r0, q1[4]
-; CHECK-NEXT: vmov.8 q0[12], r0
-; CHECK-NEXT: vmov.u16 r0, q1[5]
-; CHECK-NEXT: vmov.8 q0[13], r0
-; CHECK-NEXT: vmov.u16 r0, q1[6]
-; CHECK-NEXT: vmov.8 q0[14], r0
-; CHECK-NEXT: vmov.u16 r0, q1[7]
-; CHECK-NEXT: vmov.8 q0[15], r0
-; CHECK-NEXT: vcmp.i8 ne, q0, zr
+; CHECK-NEXT: vcmp.s16 lt, q0, zr
+; CHECK-NEXT: vpsel q0, q4, q3
+; CHECK-NEXT: vstrb.16 q1, [r0, #8]
+; CHECK-NEXT: vstrb.16 q0, [r0]
; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vcmp.i8 ne, q1, zr
; CHECK-NEXT: vpsel q0, q2, q0
-; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: add sp, #16
+; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
entry:
%ai = icmp slt <8 x i16> %a, zeroinitializer
@@ -127,84 +97,43 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @concat_v48i1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %d, <4 x i32> %e, <16 x i8> %c) {
; CHECK-LABEL: concat_v48i1:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: .pad #48
+; CHECK-NEXT: sub sp, #48
; CHECK-NEXT: vmov.i8 q4, #0x0
; CHECK-NEXT: vmov.i8 q5, #0xff
-; CHECK-NEXT: vcmp.s32 lt, q0, zr
-; CHECK-NEXT: vpsel q6, q5, q4
-; CHECK-NEXT: vcmp.s32 lt, q1, zr
-; CHECK-NEXT: vmov r0, r1, d12
-; CHECK-NEXT: vpsel q1, q5, q4
-; CHECK-NEXT: vmov.16 q0[0], r0
-; CHECK-NEXT: vmov.16 q0[1], r1
-; CHECK-NEXT: vmov r0, r1, d13
-; CHECK-NEXT: vmov.16 q0[2], r0
-; CHECK-NEXT: vmov.16 q0[3], r1
-; CHECK-NEXT: vmov r0, r1, d2
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov.16 q0[5], r1
-; CHECK-NEXT: vmov r0, r1, d3
-; CHECK-NEXT: vmov.16 q0[6], r0
-; CHECK-NEXT: vmov.16 q0[7], r1
-; CHECK-NEXT: vcmp.i16 ne, q0, zr
-; CHECK-NEXT: vpsel q1, q5, q4
-; CHECK-NEXT: vcmp.s32 lt, q2, zr
-; CHECK-NEXT: vmov.u16 r0, q1[0]
-; CHECK-NEXT: vpsel q2, q5, q4
-; CHECK-NEXT: vmov.8 q0[0], r0
-; CHECK-NEXT: vmov.u16 r0, q1[1]
-; CHECK-NEXT: vmov.8 q0[1], r0
-; CHECK-NEXT: vmov.u16 r0, q1[2]
-; CHECK-NEXT: vmov.8 q0[2], r0
-; CHECK-NEXT: vmov.u16 r0, q1[3]
-; CHECK-NEXT: vmov.8 q0[3], r0
-; CHECK-NEXT: vmov.u16 r0, q1[4]
-; CHECK-NEXT: vmov.8 q0[4], r0
-; CHECK-NEXT: vmov.u16 r0, q1[5]
-; CHECK-NEXT: vmov.8 q0[5], r0
-; CHECK-NEXT: vmov.u16 r0, q1[6]
-; CHECK-NEXT: vmov.8 q0[6], r0
-; CHECK-NEXT: vmov.u16 r0, q1[7]
-; CHECK-NEXT: vmov.8 q0[7], r0
-; CHECK-NEXT: vmov r0, r1, d4
-; CHECK-NEXT: vmov.16 q1[0], r0
; CHECK-NEXT: vcmp.s32 lt, q3, zr
-; CHECK-NEXT: vmov.16 q1[1], r1
-; CHECK-NEXT: vmov r0, r1, d5
-; CHECK-NEXT: vmov.16 q1[2], r0
+; CHECK-NEXT: add r0, sp, #16
+; CHECK-NEXT: vpsel q3, q5, q4
+; CHECK-NEXT: vcmp.s32 lt, q2, zr
; CHECK-NEXT: vpsel q2, q5, q4
-; CHECK-NEXT: vmov.16 q1[3], r1
-; CHECK-NEXT: vmov r0, r1, d4
-; CHECK-NEXT: vmov.16 q1[4], r0
-; CHECK-NEXT: vmov.16 q1[5], r1
-; CHECK-NEXT: vmov r0, r1, d5
-; CHECK-NEXT: vmov.16 q1[6], r0
-; CHECK-NEXT: vmov.16 q1[7], r1
-; CHECK-NEXT: vcmp.i16 ne, q1, zr
+; CHECK-NEXT: vcmp.s32 lt, q1, zr
; CHECK-NEXT: vpsel q1, q5, q4
-; CHECK-NEXT: vmov.u16 r0, q1[0]
-; CHECK-NEXT: vmov.8 q0[8], r0
-; CHECK-NEXT: vmov.u16 r0, q1[1]
-; CHECK-NEXT: vmov.8 q0[9], r0
-; CHECK-NEXT: vmov.u16 r0, q1[2]
-; CHECK-NEXT: vmov.8 q0[10], r0
-; CHECK-NEXT: vmov.u16 r0, q1[3]
-; CHECK-NEXT: vmov.8 q0[11], r0
-; CHECK-NEXT: vmov.u16 r0, q1[4]
-; CHECK-NEXT: vmov.8 q0[12], r0
-; CHECK-NEXT: vmov.u16 r0, q1[5]
-; CHECK-NEXT: vmov.8 q0[13], r0
-; CHECK-NEXT: vmov.u16 r0, q1[6]
-; CHECK-NEXT: vmov.8 q0[14], r0
-; CHECK-NEXT: vmov.u16 r0, q1[7]
-; CHECK-NEXT: vmov.8 q0[15], r0
-; CHECK-NEXT: add r0, sp, #48
-; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vcmp.i8 ne, q0, zr
+; CHECK-NEXT: vcmp.s32 lt, q0, zr
+; CHECK-NEXT: mov r1, sp
+; CHECK-NEXT: vpsel q0, q5, q4
+; CHECK-NEXT: vstrh.32 q3, [r0, #8]
+; CHECK-NEXT: vstrh.32 q2, [r0]
+; CHECK-NEXT: vstrh.32 q1, [r1, #8]
+; CHECK-NEXT: vstrh.32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: add r0, sp, #32
+; CHECK-NEXT: vcmp.i16 ne, q0, zr
+; CHECK-NEXT: vpsel q0, q5, q4
+; CHECK-NEXT: vstrb.16 q0, [r0, #8]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: add r1, sp, #80
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vcmp.i16 ne, q0, zr
+; CHECK-NEXT: vpsel q0, q5, q4
+; CHECK-NEXT: vstrb.16 q0, [r0]
; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: vldrw.u32 q2, [r0]
+; CHECK-NEXT: vcmp.i8 ne, q2, zr
; CHECK-NEXT: vpsel q0, q1, q0
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: add sp, #48
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr
entry:
%ai = icmp slt <4 x i32> %a, zeroinitializer
diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll
index cfdb20d15e938ba..9987ff940b5aac1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll
@@ -62,40 +62,30 @@ entry:
define arm_aapcs_vfpcc i32 @mlapred_v4i32_v4i64_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: mlapred_v4i32_v4i64_zext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: .pad #32
+; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: vorr q2, q2, q3
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vstrw.32 q2, [r0]
; CHECK-NEXT: vmov.i8 q3, #0xff
; CHECK-NEXT: vldrh.u32 q2, [r0, #8]
; CHECK-NEXT: vldrh.u32 q5, [r0]
+; CHECK-NEXT: add r0, sp, #16
; CHECK-NEXT: vcmp.i32 eq, q2, zr
; CHECK-NEXT: vmov.i8 q2, #0x0
; CHECK-NEXT: vpsel q4, q3, q2
; CHECK-NEXT: vcmp.i32 eq, q5, zr
; CHECK-NEXT: vpsel q2, q3, q2
-; CHECK-NEXT: vmov r2, r3, d8
-; CHECK-NEXT: vmov r4, r5, d4
-; CHECK-NEXT: vmov r0, r1, d5
-; CHECK-NEXT: vmov.16 q2[0], r4
-; CHECK-NEXT: vmov.16 q2[1], r5
-; CHECK-NEXT: vmov r12, lr, d9
-; CHECK-NEXT: vmov.16 q2[2], r0
-; CHECK-NEXT: vmov.16 q2[3], r1
-; CHECK-NEXT: vmov.16 q2[4], r2
-; CHECK-NEXT: vmov.16 q2[5], r3
-; CHECK-NEXT: vmov.16 q2[6], r12
-; CHECK-NEXT: vmov.16 q2[7], lr
+; CHECK-NEXT: vstrh.32 q4, [r0, #8]
+; CHECK-NEXT: vstrh.32 q2, [r0]
+; CHECK-NEXT: vldrw.u32 q2, [r0]
; CHECK-NEXT: vpt.i16 ne, q2, zr
; CHECK-NEXT: vmlavt.u16 r0, q0, q1
-; CHECK-NEXT: add sp, #16
+; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11}
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: bx lr
entry:
%aa = zext <8 x i16> %a to <8 x i32>
%bb = zext <8 x i16> %b to <8 x i32>
diff --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
index 72df912b25a9fc3..d2f79fcd5fd9826 100644
--- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
@@ -6,52 +6,42 @@
define arm_aapcs_vfpcc void @k() {
; CHECK-LABEL: k:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT: .pad #4
-; CHECK-NEXT: sub sp, #4
+; CHECK-NEXT: .save {r4, r5, r6, lr}
+; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, #16
-; CHECK-NEXT: adr.w r8, .LCPI0_0
-; CHECK-NEXT: adr.w r9, .LCPI0_1
-; CHECK-NEXT: vldrw.u32 q6, [r8]
-; CHECK-NEXT: vldrw.u32 q5, [r9]
+; CHECK-NEXT: .pad #32
+; CHECK-NEXT: sub sp, #32
+; CHECK-NEXT: adr r5, .LCPI0_0
+; CHECK-NEXT: adr r4, .LCPI0_1
+; CHECK-NEXT: vldrw.u32 q6, [r5]
+; CHECK-NEXT: vldrw.u32 q5, [r4]
+; CHECK-NEXT: add r0, sp, #16
; CHECK-NEXT: vmov.i32 q0, #0x1
; CHECK-NEXT: vmov.i8 q1, #0x0
; CHECK-NEXT: vmov.i8 q2, #0xff
; CHECK-NEXT: vmov.i16 q3, #0x6
; CHECK-NEXT: vmov.i16 q4, #0x3
-; CHECK-NEXT: mov.w r12, #0
+; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: .LBB0_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vand q6, q6, q0
; CHECK-NEXT: vand q5, q5, q0
-; CHECK-NEXT: vcmp.i32 eq, q6, zr
-; CHECK-NEXT: cmp.w r12, #0
-; CHECK-NEXT: vpsel q6, q2, q1
+; CHECK-NEXT: vand q6, q6, q0
; CHECK-NEXT: vcmp.i32 eq, q5, zr
; CHECK-NEXT: vpsel q5, q2, q1
-; CHECK-NEXT: vmov r4, r0, d12
-; CHECK-NEXT: vmov r3, r6, d10
-; CHECK-NEXT: vmov r1, r2, d11
-; CHECK-NEXT: vmov.16 q5[0], r3
-; CHECK-NEXT: vmov.16 q5[1], r6
-; CHECK-NEXT: vmov r5, r7, d13
-; CHECK-NEXT: vmov.16 q5[2], r1
-; CHECK-NEXT: vmov.16 q5[3], r2
-; CHECK-NEXT: vmov.16 q5[4], r4
-; CHECK-NEXT: vmov.16 q5[5], r0
-; CHECK-NEXT: vmov.16 q5[6], r5
-; CHECK-NEXT: vmov.16 q5[7], r7
+; CHECK-NEXT: vcmp.i32 eq, q6, zr
+; CHECK-NEXT: vpsel q6, q2, q1
+; CHECK-NEXT: vstrh.32 q5, [r0]
+; CHECK-NEXT: vstrh.32 q6, [r0, #8]
+; CHECK-NEXT: vldrw.u32 q5, [r0]
; CHECK-NEXT: vcmp.i16 ne, q5, zr
; CHECK-NEXT: vmov.i32 q5, #0x0
; CHECK-NEXT: vpsel q6, q4, q3
; CHECK-NEXT: vstrh.16 q6, [r0]
; CHECK-NEXT: vmov q6, q5
-; CHECK-NEXT: bne .LBB0_1
-; CHECK-NEXT: @ %bb.2: @ %for.cond4.preheader
+; CHECK-NEXT: cbz r1, .LBB0_2
+; CHECK-NEXT: le .LBB0_1
+; CHECK-NEXT: .LBB0_2: @ %for.cond4.preheader
; CHECK-NEXT: movs r6, #0
; CHECK-NEXT: cbnz r6, .LBB0_5
; CHECK-NEXT: .LBB0_3: @ %for.body10
@@ -61,8 +51,8 @@ define arm_aapcs_vfpcc void @k() {
; CHECK-NEXT: .LBB0_4: @ %for.cond4.loopexit
; CHECK-NEXT: bl l
; CHECK-NEXT: .LBB0_5: @ %vector.body105.preheader
-; CHECK-NEXT: vldrw.u32 q0, [r8]
-; CHECK-NEXT: vldrw.u32 q1, [r9]
+; CHECK-NEXT: vldrw.u32 q0, [r5]
+; CHECK-NEXT: vldrw.u32 q1, [r4]
; CHECK-NEXT: movs r0, #8
; CHECK-NEXT: .LBB0_6: @ %vector.body105
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
@@ -71,7 +61,7 @@ define arm_aapcs_vfpcc void @k() {
; CHECK-NEXT: cbz r6, .LBB0_7
; CHECK-NEXT: le .LBB0_6
; CHECK-NEXT: .LBB0_7: @ %vector.body115.ph
-; CHECK-NEXT: vldrw.u32 q0, [r9]
+; CHECK-NEXT: vldrw.u32 q0, [r4]
; CHECK-NEXT: movs r0, #4
; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT: @APP
diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll
index cca15165e012eb3..a55a8e4a68aebbb 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll
@@ -640,37 +640,34 @@ entry:
define <8 x i16> @shuffle6_v4i32(<4 x i32> %src1, <4 x i32> %src2, <8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: shuffle6_v4i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov d1, r2, r3
-; CHECK-NEXT: vmov.i8 q1, #0x0
-; CHECK-NEXT: vmov d0, r0, r1
-; CHECK-NEXT: vmov.i8 q2, #0xff
-; CHECK-NEXT: vcmp.i32 eq, q0, zr
-; CHECK-NEXT: vpsel q3, q2, q1
-; CHECK-NEXT: vmov r0, r1, d6
-; CHECK-NEXT: vmov.16 q0[0], r0
-; CHECK-NEXT: vmov.16 q0[1], r1
-; CHECK-NEXT: vmov r0, r1, d7
-; CHECK-NEXT: vmov.16 q0[2], r0
-; CHECK-NEXT: mov r0, sp
-; CHECK-NEXT: vldrw.u32 q3, [r0]
-; CHECK-NEXT: vmov.16 q0[3], r1
-; CHECK-NEXT: vcmp.i32 eq, q3, zr
-; CHECK-NEXT: vpsel q1, q2, q1
-; CHECK-NEXT: vmov r0, r1, d2
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov.16 q0[5], r1
-; CHECK-NEXT: vmov r0, r1, d3
-; CHECK-NEXT: vmov.16 q0[6], r0
-; CHECK-NEXT: add r0, sp, #32
-; CHECK-NEXT: vmov.16 q0[7], r1
-; CHECK-NEXT: vcmp.i16 ne, q0, zr
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: add.w r12, sp, #24
+; CHECK-NEXT: vmov.i8 q0, #0x0
+; CHECK-NEXT: vldrw.u32 q2, [r12]
+; CHECK-NEXT: vmov.i8 q1, #0xff
+; CHECK-NEXT: mov r4, sp
+; CHECK-NEXT: vcmp.i32 eq, q2, zr
+; CHECK-NEXT: vpsel q2, q1, q0
+; CHECK-NEXT: vstrh.32 q2, [r4, #8]
+; CHECK-NEXT: vmov d4, r0, r1
+; CHECK-NEXT: vmov d5, r2, r3
+; CHECK-NEXT: add r0, sp, #56
+; CHECK-NEXT: vcmp.i32 eq, q2, zr
+; CHECK-NEXT: vpsel q0, q1, q0
+; CHECK-NEXT: vstrh.32 q0, [r4]
; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: add r0, sp, #16
+; CHECK-NEXT: add r0, sp, #40
+; CHECK-NEXT: vldrw.u32 q2, [r4]
; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vcmp.i16 ne, q2, zr
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: vmov r0, r1, d0
; CHECK-NEXT: vmov r2, r3, d1
-; CHECK-NEXT: bx lr
+; CHECK-NEXT: add sp, #16
+; CHECK-NEXT: pop {r4, pc}
entry:
%c1 = icmp eq <4 x i32> %src1, zeroinitializer
%c2 = icmp eq <4 x i32> %src2, zeroinitializer
@@ -682,57 +679,34 @@ entry:
define <16 x i8> @shuffle6_v8i16(<8 x i16> %src1, <8 x i16> %src2, <16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: shuffle6_v8i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov d1, r2, r3
-; CHECK-NEXT: vmov.i8 q1, #0x0
-; CHECK-NEXT: vmov d0, r0, r1
-; CHECK-NEXT: vmov.i8 q2, #0xff
-; CHECK-NEXT: vcmp.i16 eq, q0, zr
-; CHECK-NEXT: vpsel q3, q2, q1
-; CHECK-NEXT: vmov.u16 r0, q3[0]
-; CHECK-NEXT: vmov.8 q0[0], r0
-; CHECK-NEXT: vmov.u16 r0, q3[1]
-; CHECK-NEXT: vmov.8 q0[1], r0
-; CHECK-NEXT: vmov.u16 r0, q3[2]
-; CHECK-NEXT: vmov.8 q0[2], r0
-; CHECK-NEXT: vmov.u16 r0, q3[3]
-; CHECK-NEXT: vmov.8 q0[3], r0
-; CHECK-NEXT: vmov.u16 r0, q3[4]
-; CHECK-NEXT: vmov.8 q0[4], r0
-; CHECK-NEXT: vmov.u16 r0, q3[5]
-; CHECK-NEXT: vmov.8 q0[5], r0
-; CHECK-NEXT: vmov.u16 r0, q3[6]
-; CHECK-NEXT: vmov.8 q0[6], r0
-; CHECK-NEXT: vmov.u16 r0, q3[7]
-; CHECK-NEXT: vmov.8 q0[7], r0
-; CHECK-NEXT: mov r0, sp
-; CHECK-NEXT: vldrw.u32 q3, [r0]
-; CHECK-NEXT: vcmp.i16 eq, q3, zr
-; CHECK-NEXT: vpsel q1, q2, q1
-; CHECK-NEXT: vmov.u16 r0, q1[0]
-; CHECK-NEXT: vmov.8 q0[8], r0
-; CHECK-NEXT: vmov.u16 r0, q1[1]
-; CHECK-NEXT: vmov.8 q0[9], r0
-; CHECK-NEXT: vmov.u16 r0, q1[2]
-; CHECK-NEXT: vmov.8 q0[10], r0
-; CHECK-NEXT: vmov.u16 r0, q1[3]
-; CHECK-NEXT: vmov.8 q0[11], r0
-; CHECK-NEXT: vmov.u16 r0, q1[4]
-; CHECK-NEXT: vmov.8 q0[12], r0
-; CHECK-NEXT: vmov.u16 r0, q1[5]
-; CHECK-NEXT: vmov.8 q0[13], r0
-; CHECK-NEXT: vmov.u16 r0, q1[6]
-; CHECK-NEXT: vmov.8 q0[14], r0
-; CHECK-NEXT: vmov.u16 r0, q1[7]
-; CHECK-NEXT: vmov.8 q0[15], r0
-; CHECK-NEXT: add r0, sp, #32
-; CHECK-NEXT: vcmp.i8 ne, q0, zr
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: add.w r12, sp, #24
+; CHECK-NEXT: vmov.i8 q0, #0x0
+; CHECK-NEXT: vldrw.u32 q2, [r12]
+; CHECK-NEXT: vmov.i8 q1, #0xff
+; CHECK-NEXT: mov r4, sp
+; CHECK-NEXT: vcmp.i16 eq, q2, zr
+; CHECK-NEXT: vpsel q2, q1, q0
+; CHECK-NEXT: vstrb.16 q2, [r4, #8]
+; CHECK-NEXT: vmov d4, r0, r1
+; CHECK-NEXT: vmov d5, r2, r3
+; CHECK-NEXT: add r0, sp, #56
+; CHECK-NEXT: vcmp.i16 eq, q2, zr
+; CHECK-NEXT: vpsel q0, q1, q0
+; CHECK-NEXT: vstrb.16 q0, [r4]
; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: add r0, sp, #16
+; CHECK-NEXT: add r0, sp, #40
+; CHECK-NEXT: vldrw.u32 q2, [r4]
; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vcmp.i8 ne, q2, zr
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: vmov r0, r1, d0
; CHECK-NEXT: vmov r2, r3, d1
-; CHECK-NEXT: bx lr
+; CHECK-NEXT: add sp, #16
+; CHECK-NEXT: pop {r4, pc}
entry:
%c1 = icmp eq <8 x i16> %src1, zeroinitializer
%c2 = icmp eq <8 x i16> %src2, zeroinitializer
diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
index b5d981ef340254d..fef2c39e08827e8 100644
--- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
@@ -1386,49 +1386,43 @@ for.cond.cleanup: ; preds = %vector.body, %entry
define arm_aapcs_vfpcc void @ssatmul_8t_q15(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
; CHECK-LABEL: ssatmul_8t_q15:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: .save {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq .LBB9_3
; CHECK-NEXT: @ %bb.1: @ %vector.ph
-; CHECK-NEXT: add.w r12, r3, #7
+; CHECK-NEXT: adds r4, r3, #7
+; CHECK-NEXT: vmov.i8 q2, #0x0
+; CHECK-NEXT: bic r4, r4, #7
+; CHECK-NEXT: vmov.i8 q3, #0xff
+; CHECK-NEXT: sub.w r12, r4, #8
+; CHECK-NEXT: movs r4, #1
+; CHECK-NEXT: mov r5, sp
+; CHECK-NEXT: add.w lr, r4, r12, lsr #3
; CHECK-NEXT: adr r4, .LCPI9_0
-; CHECK-NEXT: bic r12, r12, #7
-; CHECK-NEXT: mov.w lr, #1
-; CHECK-NEXT: sub.w r12, r12, #8
; CHECK-NEXT: vldrw.u32 q0, [r4]
; CHECK-NEXT: adr r4, .LCPI9_1
-; CHECK-NEXT: vmov.i8 q2, #0x0
-; CHECK-NEXT: add.w lr, lr, r12, lsr #3
; CHECK-NEXT: sub.w r12, r3, #1
; CHECK-NEXT: vldrw.u32 q4, [r4]
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vdup.32 q1, r12
-; CHECK-NEXT: vmov.i8 q3, #0xff
; CHECK-NEXT: .LBB9_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vdup.32 q6, r3
+; CHECK-NEXT: vdup.32 q5, r3
; CHECK-NEXT: adds r3, #8
-; CHECK-NEXT: vorr q5, q6, q0
-; CHECK-NEXT: vorr q6, q6, q4
-; CHECK-NEXT: vcmp.u32 cs, q1, q5
-; CHECK-NEXT: vpsel q7, q3, q2
+; CHECK-NEXT: vorr q6, q5, q0
+; CHECK-NEXT: vorr q5, q5, q4
; CHECK-NEXT: vcmp.u32 cs, q1, q6
-; CHECK-NEXT: vmov r4, r12, d14
; CHECK-NEXT: vpsel q6, q3, q2
-; CHECK-NEXT: vmov.16 q5[0], r4
-; CHECK-NEXT: vmov.16 q5[1], r12
-; CHECK-NEXT: vmov r4, r12, d15
-; CHECK-NEXT: vmov.16 q5[2], r4
-; CHECK-NEXT: vmov.16 q5[3], r12
-; CHECK-NEXT: vmov r4, r12, d12
-; CHECK-NEXT: vmov.16 q5[4], r4
-; CHECK-NEXT: vmov.16 q5[5], r12
-; CHECK-NEXT: vmov r4, r12, d13
-; CHECK-NEXT: vmov.16 q5[6], r4
-; CHECK-NEXT: vmov.16 q5[7], r12
+; CHECK-NEXT: vcmp.u32 cs, q1, q5
+; CHECK-NEXT: vpsel q5, q3, q2
+; CHECK-NEXT: vstrh.32 q6, [r5, #8]
+; CHECK-NEXT: vstrh.32 q5, [r5]
+; CHECK-NEXT: vldrw.u32 q5, [r5]
; CHECK-NEXT: vptt.i16 ne, q5, zr
; CHECK-NEXT: vldrht.u16 q5, [r0], #16
; CHECK-NEXT: vldrht.u16 q6, [r1], #16
@@ -1440,20 +1434,21 @@ define arm_aapcs_vfpcc void @ssatmul_8t_q15(ptr nocapture readonly %pSrcA, ptr n
; CHECK-NEXT: vstrht.16 q5, [r2], #16
; CHECK-NEXT: le lr, .LBB9_2
; CHECK-NEXT: .LBB9_3: @ %for.cond.cleanup
+; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: pop {r4, pc}
+; CHECK-NEXT: pop {r4, r5, r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.4:
; CHECK-NEXT: .LCPI9_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
-; CHECK-NEXT: .LCPI9_1:
; CHECK-NEXT: .long 4 @ 0x4
; CHECK-NEXT: .long 5 @ 0x5
; CHECK-NEXT: .long 6 @ 0x6
; CHECK-NEXT: .long 7 @ 0x7
+; CHECK-NEXT: .LCPI9_1:
+; CHECK-NEXT: .long 0 @ 0x0
+; CHECK-NEXT: .long 1 @ 0x1
+; CHECK-NEXT: .long 2 @ 0x2
+; CHECK-NEXT: .long 3 @ 0x3
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
@@ -1498,49 +1493,43 @@ for.cond.cleanup: ; preds = %vector.body, %entry
define arm_aapcs_vfpcc void @ssatmul_8ti_q15(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
; CHECK-LABEL: ssatmul_8ti_q15:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: .save {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq .LBB10_3
; CHECK-NEXT: @ %bb.1: @ %vector.ph
-; CHECK-NEXT: add.w r12, r3, #7
+; CHECK-NEXT: adds r4, r3, #7
+; CHECK-NEXT: vmov.i8 q2, #0x0
+; CHECK-NEXT: bic r4, r4, #7
+; CHECK-NEXT: vmov.i8 q3, #0xff
+; CHECK-NEXT: sub.w r12, r4, #8
+; CHECK-NEXT: movs r4, #1
+; CHECK-NEXT: mov r5, sp
+; CHECK-NEXT: add.w lr, r4, r12, lsr #3
; CHECK-NEXT: adr r4, .LCPI10_0
-; CHECK-NEXT: bic r12, r12, #7
-; CHECK-NEXT: mov.w lr, #1
-; CHECK-NEXT: sub.w r12, r12, #8
; CHECK-NEXT: vldrw.u32 q0, [r4]
; CHECK-NEXT: adr r4, .LCPI10_1
-; CHECK-NEXT: vmov.i8 q2, #0x0
-; CHECK-NEXT: add.w lr, lr, r12, lsr #3
; CHECK-NEXT: sub.w r12, r3, #1
; CHECK-NEXT: vldrw.u32 q4, [r4]
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vdup.32 q1, r12
-; CHECK-NEXT: vmov.i8 q3, #0xff
; CHECK-NEXT: .LBB10_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vdup.32 q6, r3
+; CHECK-NEXT: vdup.32 q5, r3
; CHECK-NEXT: adds r3, #8
-; CHECK-NEXT: vorr q5, q6, q0
-; CHECK-NEXT: vorr q6, q6, q4
-; CHECK-NEXT: vcmp.u32 cs, q1, q5
-; CHECK-NEXT: vpsel q7, q3, q2
+; CHECK-NEXT: vorr q6, q5, q0
+; CHECK-NEXT: vorr q5, q5, q4
; CHECK-NEXT: vcmp.u32 cs, q1, q6
-; CHECK-NEXT: vmov r4, r12, d14
; CHECK-NEXT: vpsel q6, q3, q2
-; CHECK-NEXT: vmov.16 q5[0], r4
-; CHECK-NEXT: vmov.16 q5[1], r12
-; CHECK-NEXT: vmov r4, r12, d15
-; CHECK-NEXT: vmov.16 q5[2], r4
-; CHECK-NEXT: vmov.16 q5[3], r12
-; CHECK-NEXT: vmov r4, r12, d12
-; CHECK-NEXT: vmov.16 q5[4], r4
-; CHECK-NEXT: vmov.16 q5[5], r12
-; CHECK-NEXT: vmov r4, r12, d13
-; CHECK-NEXT: vmov.16 q5[6], r4
-; CHECK-NEXT: vmov.16 q5[7], r12
+; CHECK-NEXT: vcmp.u32 cs, q1, q5
+; CHECK-NEXT: vpsel q5, q3, q2
+; CHECK-NEXT: vstrh.32 q6, [r5, #8]
+; CHECK-NEXT: vstrh.32 q5, [r5]
+; CHECK-NEXT: vldrw.u32 q5, [r5]
; CHECK-NEXT: vptt.i16 ne, q5, zr
; CHECK-NEXT: vldrht.u16 q5, [r0], #16
; CHECK-NEXT: vldrht.u16 q6, [r1], #16
@@ -1552,20 +1541,21 @@ define arm_aapcs_vfpcc void @ssatmul_8ti_q15(ptr nocapture readonly %pSrcA, ptr
; CHECK-NEXT: vstrht.16 q5, [r2], #16
; CHECK-NEXT: le lr, .LBB10_2
; CHECK-NEXT: .LBB10_3: @ %for.cond.cleanup
+; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: pop {r4, pc}
+; CHECK-NEXT: pop {r4, r5, r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.4:
; CHECK-NEXT: .LCPI10_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
-; CHECK-NEXT: .LCPI10_1:
; CHECK-NEXT: .long 4 @ 0x4
; CHECK-NEXT: .long 5 @ 0x5
; CHECK-NEXT: .long 6 @ 0x6
; CHECK-NEXT: .long 7 @ 0x7
+; CHECK-NEXT: .LCPI10_1:
+; CHECK-NEXT: .long 0 @ 0x0
+; CHECK-NEXT: .long 1 @ 0x1
+; CHECK-NEXT: .long 2 @ 0x2
+; CHECK-NEXT: .long 3 @ 0x3
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
@@ -2386,49 +2376,43 @@ for.body: ; preds = %for.body, %for.body
define arm_aapcs_vfpcc void @ssatmul_8t_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
; CHECK-LABEL: ssatmul_8t_q7:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: .save {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq .LBB17_3
; CHECK-NEXT: @ %bb.1: @ %vector.ph
-; CHECK-NEXT: add.w r12, r3, #7
+; CHECK-NEXT: adds r4, r3, #7
+; CHECK-NEXT: vmov.i8 q2, #0x0
+; CHECK-NEXT: bic r4, r4, #7
+; CHECK-NEXT: vmov.i8 q3, #0xff
+; CHECK-NEXT: sub.w r12, r4, #8
+; CHECK-NEXT: movs r4, #1
+; CHECK-NEXT: mov r5, sp
+; CHECK-NEXT: add.w lr, r4, r12, lsr #3
; CHECK-NEXT: adr r4, .LCPI17_0
-; CHECK-NEXT: bic r12, r12, #7
-; CHECK-NEXT: mov.w lr, #1
-; CHECK-NEXT: sub.w r12, r12, #8
; CHECK-NEXT: vldrw.u32 q0, [r4]
; CHECK-NEXT: adr r4, .LCPI17_1
-; CHECK-NEXT: vmov.i8 q2, #0x0
-; CHECK-NEXT: add.w lr, lr, r12, lsr #3
; CHECK-NEXT: sub.w r12, r3, #1
; CHECK-NEXT: vldrw.u32 q4, [r4]
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vdup.32 q1, r12
-; CHECK-NEXT: vmov.i8 q3, #0xff
; CHECK-NEXT: .LBB17_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vdup.32 q6, r3
+; CHECK-NEXT: vdup.32 q5, r3
; CHECK-NEXT: adds r3, #8
-; CHECK-NEXT: vorr q5, q6, q0
-; CHECK-NEXT: vorr q6, q6, q4
-; CHECK-NEXT: vcmp.u32 cs, q1, q5
-; CHECK-NEXT: vpsel q7, q3, q2
+; CHECK-NEXT: vorr q6, q5, q0
+; CHECK-NEXT: vorr q5, q5, q4
; CHECK-NEXT: vcmp.u32 cs, q1, q6
-; CHECK-NEXT: vmov r4, r12, d14
; CHECK-NEXT: vpsel q6, q3, q2
-; CHECK-NEXT: vmov.16 q5[0], r4
-; CHECK-NEXT: vmov.16 q5[1], r12
-; CHECK-NEXT: vmov r4, r12, d15
-; CHECK-NEXT: vmov.16 q5[2], r4
-; CHECK-NEXT: vmov.16 q5[3], r12
-; CHECK-NEXT: vmov r4, r12, d12
-; CHECK-NEXT: vmov.16 q5[4], r4
-; CHECK-NEXT: vmov.16 q5[5], r12
-; CHECK-NEXT: vmov r4, r12, d13
-; CHECK-NEXT: vmov.16 q5[6], r4
-; CHECK-NEXT: vmov.16 q5[7], r12
+; CHECK-NEXT: vcmp.u32 cs, q1, q5
+; CHECK-NEXT: vpsel q5, q3, q2
+; CHECK-NEXT: vstrh.32 q6, [r5, #8]
+; CHECK-NEXT: vstrh.32 q5, [r5]
+; CHECK-NEXT: vldrw.u32 q5, [r5]
; CHECK-NEXT: vptt.i16 ne, q5, zr
; CHECK-NEXT: vldrbt.s16 q5, [r0], #8
; CHECK-NEXT: vldrbt.s16 q6, [r1], #8
@@ -2438,20 +2422,21 @@ define arm_aapcs_vfpcc void @ssatmul_8t_q7(ptr nocapture readonly %pSrcA, ptr no
; CHECK-NEXT: vstrbt.16 q5, [r2], #8
; CHECK-NEXT: le lr, .LBB17_2
; CHECK-NEXT: .LBB17_3: @ %for.cond.cleanup
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: pop {r4, pc}
+; CHECK-NEXT: add sp, #16
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: pop {r4, r5, r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.4:
; CHECK-NEXT: .LCPI17_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
-; CHECK-NEXT: .LCPI17_1:
; CHECK-NEXT: .long 4 @ 0x4
; CHECK-NEXT: .long 5 @ 0x5
; CHECK-NEXT: .long 6 @ 0x6
; CHECK-NEXT: .long 7 @ 0x7
+; CHECK-NEXT: .LCPI17_1:
+; CHECK-NEXT: .long 0 @ 0x0
+; CHECK-NEXT: .long 1 @ 0x1
+; CHECK-NEXT: .long 2 @ 0x2
+; CHECK-NEXT: .long 3 @ 0x3
entry:
%cmp10 = icmp eq i32 %N, 0
br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
@@ -2496,151 +2481,105 @@ for.cond.cleanup: ; preds = %vector.body, %entry
define arm_aapcs_vfpcc void @ssatmul_16t_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
; CHECK-LABEL: ssatmul_16t_q7:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: .save {r4, r5, r6, lr}
+; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #48
-; CHECK-NEXT: sub sp, #48
+; CHECK-NEXT: .pad #80
+; CHECK-NEXT: sub sp, #80
; CHECK-NEXT: cmp r3, #0
-; CHECK-NEXT: beq.w .LBB18_3
+; CHECK-NEXT: beq .LBB18_3
; CHECK-NEXT: @ %bb.1: @ %vector.ph
-; CHECK-NEXT: add.w r12, r3, #15
-; CHECK-NEXT: adr r4, .LCPI18_0
-; CHECK-NEXT: bic r12, r12, #15
-; CHECK-NEXT: vldrw.u32 q0, [r4]
-; CHECK-NEXT: sub.w r12, r12, #16
-; CHECK-NEXT: mov.w lr, #1
-; CHECK-NEXT: adr r4, .LCPI18_1
-; CHECK-NEXT: movs r5, #0
-; CHECK-NEXT: add.w lr, lr, r12, lsr #4
-; CHECK-NEXT: sub.w r12, r3, #1
-; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [r4]
-; CHECK-NEXT: adr r4, .LCPI18_2
-; CHECK-NEXT: vdup.32 q1, r12
+; CHECK-NEXT: add.w r6, r3, #15
+; CHECK-NEXT: movs r5, #1
+; CHECK-NEXT: bic r6, r6, #15
+; CHECK-NEXT: add r4, sp, #48
+; CHECK-NEXT: subs r6, #16
; CHECK-NEXT: vmov.i8 q2, #0x0
-; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [r4]
-; CHECK-NEXT: adr r4, .LCPI18_3
; CHECK-NEXT: vmov.i8 q3, #0xff
-; CHECK-NEXT: vldrw.u32 q6, [r4]
+; CHECK-NEXT: add.w lr, r5, r6, lsr #4
+; CHECK-NEXT: adr r5, .LCPI18_0
+; CHECK-NEXT: subs r6, r3, #1
+; CHECK-NEXT: vldrw.u32 q0, [r5]
+; CHECK-NEXT: vdup.32 q1, r6
+; CHECK-NEXT: adr r6, .LCPI18_1
+; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q0, [r6]
+; CHECK-NEXT: adr r6, .LCPI18_2
+; CHECK-NEXT: vldrw.u32 q5, [r6]
+; CHECK-NEXT: adr r6, .LCPI18_3
+; CHECK-NEXT: vldrw.u32 q6, [r6]
+; CHECK-NEXT: add r5, sp, #32
+; CHECK-NEXT: add r6, sp, #64
+; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT: .LBB18_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: vdup.32 q0, r5
-; CHECK-NEXT: adds r5, #16
-; CHECK-NEXT: vorr q4, q0, q4
-; CHECK-NEXT: vcmp.u32 cs, q1, q4
-; CHECK-NEXT: vpsel q4, q3, q2
-; CHECK-NEXT: vmov r4, r3, d8
-; CHECK-NEXT: vmov.16 q7[0], r4
-; CHECK-NEXT: vmov.16 q7[1], r3
-; CHECK-NEXT: vmov r3, r4, d9
-; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vmov.16 q7[2], r3
-; CHECK-NEXT: vmov.16 q7[3], r4
-; CHECK-NEXT: vorr q4, q0, q4
-; CHECK-NEXT: vcmp.u32 cs, q1, q4
-; CHECK-NEXT: vpsel q4, q3, q2
-; CHECK-NEXT: vmov r3, r4, d8
-; CHECK-NEXT: vmov.16 q7[4], r3
-; CHECK-NEXT: vmov.16 q7[5], r4
-; CHECK-NEXT: vmov r3, r4, d9
-; CHECK-NEXT: vmov.16 q7[6], r3
-; CHECK-NEXT: vmov.16 q7[7], r4
-; CHECK-NEXT: vcmp.i16 ne, q7, zr
-; CHECK-NEXT: vpsel q4, q3, q2
-; CHECK-NEXT: vmov.u16 r3, q4[0]
-; CHECK-NEXT: vmov.8 q7[0], r3
-; CHECK-NEXT: vmov.u16 r3, q4[1]
-; CHECK-NEXT: vmov.8 q7[1], r3
-; CHECK-NEXT: vmov.u16 r3, q4[2]
-; CHECK-NEXT: vmov.8 q7[2], r3
-; CHECK-NEXT: vmov.u16 r3, q4[3]
-; CHECK-NEXT: vmov.8 q7[3], r3
-; CHECK-NEXT: vmov.u16 r3, q4[4]
-; CHECK-NEXT: vmov.8 q7[4], r3
-; CHECK-NEXT: vmov.u16 r3, q4[5]
-; CHECK-NEXT: vmov.8 q7[5], r3
-; CHECK-NEXT: vmov.u16 r3, q4[6]
-; CHECK-NEXT: vmov.8 q7[6], r3
-; CHECK-NEXT: vmov.u16 r3, q4[7]
-; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
-; CHECK-NEXT: vmov.8 q7[7], r3
-; CHECK-NEXT: vorr q4, q0, q4
-; CHECK-NEXT: vorr q0, q0, q6
-; CHECK-NEXT: vcmp.u32 cs, q1, q4
-; CHECK-NEXT: vpsel q5, q3, q2
+; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vdup.32 q7, r3
+; CHECK-NEXT: adds r3, #16
+; CHECK-NEXT: vorr q0, q7, q0
; CHECK-NEXT: vcmp.u32 cs, q1, q0
-; CHECK-NEXT: vmov r3, r4, d10
; CHECK-NEXT: vpsel q0, q3, q2
-; CHECK-NEXT: vmov.16 q4[0], r3
-; CHECK-NEXT: vmov.16 q4[1], r4
-; CHECK-NEXT: vmov r3, r4, d11
-; CHECK-NEXT: vmov.16 q4[2], r3
-; CHECK-NEXT: vmov.16 q4[3], r4
-; CHECK-NEXT: vmov r3, r4, d0
-; CHECK-NEXT: vmov.16 q4[4], r3
-; CHECK-NEXT: vmov.16 q4[5], r4
-; CHECK-NEXT: vmov r3, r4, d1
-; CHECK-NEXT: vmov.16 q4[6], r3
-; CHECK-NEXT: vmov.16 q4[7], r4
-; CHECK-NEXT: vcmp.i16 ne, q4, zr
+; CHECK-NEXT: vstrh.32 q0, [r4, #8]
+; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT: vorr q0, q7, q0
+; CHECK-NEXT: vcmp.u32 cs, q1, q0
+; CHECK-NEXT: vpsel q0, q3, q2
+; CHECK-NEXT: vstrh.32 q0, [r4]
+; CHECK-NEXT: vorr q0, q7, q5
+; CHECK-NEXT: vcmp.u32 cs, q1, q0
; CHECK-NEXT: vpsel q0, q3, q2
-; CHECK-NEXT: vmov.u16 r3, q0[0]
-; CHECK-NEXT: vmov.8 q7[8], r3
-; CHECK-NEXT: vmov.u16 r3, q0[1]
-; CHECK-NEXT: vmov.8 q7[9], r3
-; CHECK-NEXT: vmov.u16 r3, q0[2]
-; CHECK-NEXT: vmov.8 q7[10], r3
-; CHECK-NEXT: vmov.u16 r3, q0[3]
-; CHECK-NEXT: vmov.8 q7[11], r3
-; CHECK-NEXT: vmov.u16 r3, q0[4]
-; CHECK-NEXT: vmov.8 q7[12], r3
-; CHECK-NEXT: vmov.u16 r3, q0[5]
-; CHECK-NEXT: vmov.8 q7[13], r3
-; CHECK-NEXT: vmov.u16 r3, q0[6]
-; CHECK-NEXT: vmov.8 q7[14], r3
-; CHECK-NEXT: vmov.u16 r3, q0[7]
-; CHECK-NEXT: vmov.8 q7[15], r3
-; CHECK-NEXT: vptt.i8 ne, q7, zr
+; CHECK-NEXT: vstrh.32 q0, [r5, #8]
+; CHECK-NEXT: vorr q0, q7, q6
+; CHECK-NEXT: vcmp.u32 cs, q1, q0
+; CHECK-NEXT: vpsel q0, q3, q2
+; CHECK-NEXT: vstrh.32 q0, [r5]
+; CHECK-NEXT: vldrw.u32 q0, [r4]
+; CHECK-NEXT: vcmp.i16 ne, q0, zr
+; CHECK-NEXT: vpsel q0, q3, q2
+; CHECK-NEXT: vstrb.16 q0, [r6, #8]
+; CHECK-NEXT: vldrw.u32 q0, [r5]
+; CHECK-NEXT: vcmp.i16 ne, q0, zr
+; CHECK-NEXT: vpsel q0, q3, q2
+; CHECK-NEXT: vstrb.16 q0, [r6]
+; CHECK-NEXT: vldrw.u32 q0, [r6]
+; CHECK-NEXT: vptt.i8 ne, q0, zr
; CHECK-NEXT: vldrbt.u8 q0, [r0], #16
-; CHECK-NEXT: vldrbt.u8 q4, [r1], #16
-; CHECK-NEXT: vmullt.s8 q5, q4, q0
-; CHECK-NEXT: vmullb.s8 q0, q4, q0
+; CHECK-NEXT: vldrbt.u8 q7, [r1], #16
+; CHECK-NEXT: vmullt.s8 q4, q7, q0
+; CHECK-NEXT: vmullb.s8 q0, q7, q0
; CHECK-NEXT: vqshrnb.s16 q0, q0, #7
-; CHECK-NEXT: vqshrnt.s16 q0, q5, #7
+; CHECK-NEXT: vqshrnt.s16 q0, q4, #7
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrbt.8 q0, [r2], #16
; CHECK-NEXT: le lr, .LBB18_2
; CHECK-NEXT: .LBB18_3: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #48
+; CHECK-NEXT: add sp, #80
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: pop {r4, r5, r6, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.4:
; CHECK-NEXT: .LCPI18_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
+; CHECK-NEXT: .long 12 @ 0xc
+; CHECK-NEXT: .long 13 @ 0xd
+; CHECK-NEXT: .long 14 @ 0xe
+; CHECK-NEXT: .long 15 @ 0xf
; CHECK-NEXT: .LCPI18_1:
-; CHECK-NEXT: .long 4 @ 0x4
-; CHECK-NEXT: .long 5 @ 0x5
-; CHECK-NEXT: .long 6 @ 0x6
-; CHECK-NEXT: .long 7 @ 0x7
-; CHECK-NEXT: .LCPI18_2:
; CHECK-NEXT: .long 8 @ 0x8
; CHECK-NEXT: .long 9 @ 0x9
; CHECK-NEXT: .long 10 @ 0xa
; CHECK-NEXT: .long 11 @ 0xb
+; CHECK-NEXT: .LCPI18_2:
+; CHECK-NEXT: .long 4 @ 0x4
+; CHECK-NEXT: .long 5 @ 0x5
+; CHECK-NEXT: .long 6 @ 0x6
+; CHECK-NEXT: .long 7 @ 0x7
; CHECK-NEXT: .LCPI18_3:
-; CHECK-NEXT: .long 12 @ 0xc
-; CHECK-NEXT: .long 13 @ 0xd
-; CHECK-NEXT: .long 14 @ 0xe
-; CHECK-NEXT: .long 15 @ 0xf
+; CHECK-NEXT: .long 0 @ 0x0
+; CHECK-NEXT: .long 1 @ 0x1
+; CHECK-NEXT: .long 2 @ 0x2
+; CHECK-NEXT: .long 3 @ 0x3
entry:
%cmp10 = icmp eq i32 %N, 0
br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
@@ -2685,151 +2624,105 @@ for.cond.cleanup: ; preds = %vector.body, %entry
define arm_aapcs_vfpcc void @ssatmul_16ti_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
; CHECK-LABEL: ssatmul_16ti_q7:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: .save {r4, r5, r6, lr}
+; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #48
-; CHECK-NEXT: sub sp, #48
+; CHECK-NEXT: .pad #80
+; CHECK-NEXT: sub sp, #80
; CHECK-NEXT: cmp r3, #0
-; CHECK-NEXT: beq.w .LBB19_3
+; CHECK-NEXT: beq .LBB19_3
; CHECK-NEXT: @ %bb.1: @ %vector.ph
-; CHECK-NEXT: add.w r12, r3, #15
-; CHECK-NEXT: adr r4, .LCPI19_0
-; CHECK-NEXT: bic r12, r12, #15
-; CHECK-NEXT: vldrw.u32 q0, [r4]
-; CHECK-NEXT: sub.w r12, r12, #16
-; CHECK-NEXT: mov.w lr, #1
-; CHECK-NEXT: adr r4, .LCPI19_1
-; CHECK-NEXT: movs r5, #0
-; CHECK-NEXT: add.w lr, lr, r12, lsr #4
-; CHECK-NEXT: sub.w r12, r3, #1
-; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [r4]
-; CHECK-NEXT: adr r4, .LCPI19_2
-; CHECK-NEXT: vdup.32 q1, r12
+; CHECK-NEXT: add.w r6, r3, #15
+; CHECK-NEXT: movs r5, #1
+; CHECK-NEXT: bic r6, r6, #15
+; CHECK-NEXT: add r4, sp, #48
+; CHECK-NEXT: subs r6, #16
; CHECK-NEXT: vmov.i8 q2, #0x0
-; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [r4]
-; CHECK-NEXT: adr r4, .LCPI19_3
; CHECK-NEXT: vmov.i8 q3, #0xff
-; CHECK-NEXT: vldrw.u32 q6, [r4]
+; CHECK-NEXT: add.w lr, r5, r6, lsr #4
+; CHECK-NEXT: adr r5, .LCPI19_0
+; CHECK-NEXT: subs r6, r3, #1
+; CHECK-NEXT: vldrw.u32 q0, [r5]
+; CHECK-NEXT: vdup.32 q1, r6
+; CHECK-NEXT: adr r6, .LCPI19_1
+; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q0, [r6]
+; CHECK-NEXT: adr r6, .LCPI19_2
+; CHECK-NEXT: vldrw.u32 q5, [r6]
+; CHECK-NEXT: adr r6, .LCPI19_3
+; CHECK-NEXT: vldrw.u32 q6, [r6]
+; CHECK-NEXT: add r5, sp, #32
+; CHECK-NEXT: add r6, sp, #64
+; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT: .LBB19_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: vdup.32 q0, r5
-; CHECK-NEXT: adds r5, #16
-; CHECK-NEXT: vorr q4, q0, q4
-; CHECK-NEXT: vcmp.u32 cs, q1, q4
-; CHECK-NEXT: vpsel q4, q3, q2
-; CHECK-NEXT: vmov r4, r3, d8
-; CHECK-NEXT: vmov.16 q7[0], r4
-; CHECK-NEXT: vmov.16 q7[1], r3
-; CHECK-NEXT: vmov r3, r4, d9
-; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vmov.16 q7[2], r3
-; CHECK-NEXT: vmov.16 q7[3], r4
-; CHECK-NEXT: vorr q4, q0, q4
-; CHECK-NEXT: vcmp.u32 cs, q1, q4
-; CHECK-NEXT: vpsel q4, q3, q2
-; CHECK-NEXT: vmov r3, r4, d8
-; CHECK-NEXT: vmov.16 q7[4], r3
-; CHECK-NEXT: vmov.16 q7[5], r4
-; CHECK-NEXT: vmov r3, r4, d9
-; CHECK-NEXT: vmov.16 q7[6], r3
-; CHECK-NEXT: vmov.16 q7[7], r4
-; CHECK-NEXT: vcmp.i16 ne, q7, zr
-; CHECK-NEXT: vpsel q4, q3, q2
-; CHECK-NEXT: vmov.u16 r3, q4[0]
-; CHECK-NEXT: vmov.8 q7[0], r3
-; CHECK-NEXT: vmov.u16 r3, q4[1]
-; CHECK-NEXT: vmov.8 q7[1], r3
-; CHECK-NEXT: vmov.u16 r3, q4[2]
-; CHECK-NEXT: vmov.8 q7[2], r3
-; CHECK-NEXT: vmov.u16 r3, q4[3]
-; CHECK-NEXT: vmov.8 q7[3], r3
-; CHECK-NEXT: vmov.u16 r3, q4[4]
-; CHECK-NEXT: vmov.8 q7[4], r3
-; CHECK-NEXT: vmov.u16 r3, q4[5]
-; CHECK-NEXT: vmov.8 q7[5], r3
-; CHECK-NEXT: vmov.u16 r3, q4[6]
-; CHECK-NEXT: vmov.8 q7[6], r3
-; CHECK-NEXT: vmov.u16 r3, q4[7]
-; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
-; CHECK-NEXT: vmov.8 q7[7], r3
-; CHECK-NEXT: vorr q4, q0, q4
-; CHECK-NEXT: vorr q0, q0, q6
-; CHECK-NEXT: vcmp.u32 cs, q1, q4
-; CHECK-NEXT: vpsel q5, q3, q2
+; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vdup.32 q7, r3
+; CHECK-NEXT: adds r3, #16
+; CHECK-NEXT: vorr q0, q7, q0
; CHECK-NEXT: vcmp.u32 cs, q1, q0
-; CHECK-NEXT: vmov r3, r4, d10
; CHECK-NEXT: vpsel q0, q3, q2
-; CHECK-NEXT: vmov.16 q4[0], r3
-; CHECK-NEXT: vmov.16 q4[1], r4
-; CHECK-NEXT: vmov r3, r4, d11
-; CHECK-NEXT: vmov.16 q4[2], r3
-; CHECK-NEXT: vmov.16 q4[3], r4
-; CHECK-NEXT: vmov r3, r4, d0
-; CHECK-NEXT: vmov.16 q4[4], r3
-; CHECK-NEXT: vmov.16 q4[5], r4
-; CHECK-NEXT: vmov r3, r4, d1
-; CHECK-NEXT: vmov.16 q4[6], r3
-; CHECK-NEXT: vmov.16 q4[7], r4
-; CHECK-NEXT: vcmp.i16 ne, q4, zr
+; CHECK-NEXT: vstrh.32 q0, [r4, #8]
+; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT: vorr q0, q7, q0
+; CHECK-NEXT: vcmp.u32 cs, q1, q0
+; CHECK-NEXT: vpsel q0, q3, q2
+; CHECK-NEXT: vstrh.32 q0, [r4]
+; CHECK-NEXT: vorr q0, q7, q5
+; CHECK-NEXT: vcmp.u32 cs, q1, q0
+; CHECK-NEXT: vpsel q0, q3, q2
+; CHECK-NEXT: vstrh.32 q0, [r5, #8]
+; CHECK-NEXT: vorr q0, q7, q6
+; CHECK-NEXT: vcmp.u32 cs, q1, q0
; CHECK-NEXT: vpsel q0, q3, q2
-; CHECK-NEXT: vmov.u16 r3, q0[0]
-; CHECK-NEXT: vmov.8 q7[8], r3
-; CHECK-NEXT: vmov.u16 r3, q0[1]
-; CHECK-NEXT: vmov.8 q7[9], r3
-; CHECK-NEXT: vmov.u16 r3, q0[2]
-; CHECK-NEXT: vmov.8 q7[10], r3
-; CHECK-NEXT: vmov.u16 r3, q0[3]
-; CHECK-NEXT: vmov.8 q7[11], r3
-; CHECK-NEXT: vmov.u16 r3, q0[4]
-; CHECK-NEXT: vmov.8 q7[12], r3
-; CHECK-NEXT: vmov.u16 r3, q0[5]
-; CHECK-NEXT: vmov.8 q7[13], r3
-; CHECK-NEXT: vmov.u16 r3, q0[6]
-; CHECK-NEXT: vmov.8 q7[14], r3
-; CHECK-NEXT: vmov.u16 r3, q0[7]
-; CHECK-NEXT: vmov.8 q7[15], r3
-; CHECK-NEXT: vptt.i8 ne, q7, zr
+; CHECK-NEXT: vstrh.32 q0, [r5]
+; CHECK-NEXT: vldrw.u32 q0, [r4]
+; CHECK-NEXT: vcmp.i16 ne, q0, zr
+; CHECK-NEXT: vpsel q0, q3, q2
+; CHECK-NEXT: vstrb.16 q0, [r6, #8]
+; CHECK-NEXT: vldrw.u32 q0, [r5]
+; CHECK-NEXT: vcmp.i16 ne, q0, zr
+; CHECK-NEXT: vpsel q0, q3, q2
+; CHECK-NEXT: vstrb.16 q0, [r6]
+; CHECK-NEXT: vldrw.u32 q0, [r6]
+; CHECK-NEXT: vptt.i8 ne, q0, zr
; CHECK-NEXT: vldrbt.u8 q0, [r0], #16
-; CHECK-NEXT: vldrbt.u8 q4, [r1], #16
-; CHECK-NEXT: vmullt.s8 q5, q4, q0
-; CHECK-NEXT: vmullb.s8 q0, q4, q0
+; CHECK-NEXT: vldrbt.u8 q7, [r1], #16
+; CHECK-NEXT: vmullt.s8 q4, q7, q0
+; CHECK-NEXT: vmullb.s8 q0, q7, q0
; CHECK-NEXT: vqshrnb.s16 q0, q0, #7
-; CHECK-NEXT: vqshrnt.s16 q0, q5, #7
+; CHECK-NEXT: vqshrnt.s16 q0, q4, #7
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrbt.8 q0, [r2], #16
; CHECK-NEXT: le lr, .LBB19_2
; CHECK-NEXT: .LBB19_3: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #48
+; CHECK-NEXT: add sp, #80
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: pop {r4, r5, r6, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.4:
; CHECK-NEXT: .LCPI19_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
+; CHECK-NEXT: .long 12 @ 0xc
+; CHECK-NEXT: .long 13 @ 0xd
+; CHECK-NEXT: .long 14 @ 0xe
+; CHECK-NEXT: .long 15 @ 0xf
; CHECK-NEXT: .LCPI19_1:
-; CHECK-NEXT: .long 4 @ 0x4
-; CHECK-NEXT: .long 5 @ 0x5
-; CHECK-NEXT: .long 6 @ 0x6
-; CHECK-NEXT: .long 7 @ 0x7
-; CHECK-NEXT: .LCPI19_2:
; CHECK-NEXT: .long 8 @ 0x8
; CHECK-NEXT: .long 9 @ 0x9
; CHECK-NEXT: .long 10 @ 0xa
; CHECK-NEXT: .long 11 @ 0xb
+; CHECK-NEXT: .LCPI19_2:
+; CHECK-NEXT: .long 4 @ 0x4
+; CHECK-NEXT: .long 5 @ 0x5
+; CHECK-NEXT: .long 6 @ 0x6
+; CHECK-NEXT: .long 7 @ 0x7
; CHECK-NEXT: .LCPI19_3:
-; CHECK-NEXT: .long 12 @ 0xc
-; CHECK-NEXT: .long 13 @ 0xd
-; CHECK-NEXT: .long 14 @ 0xe
-; CHECK-NEXT: .long 15 @ 0xf
+; CHECK-NEXT: .long 0 @ 0x0
+; CHECK-NEXT: .long 1 @ 0x1
+; CHECK-NEXT: .long 2 @ 0x2
+; CHECK-NEXT: .long 3 @ 0x3
entry:
%cmp10 = icmp eq i32 %N, 0
br i1 %cmp10, label %for.cond.cleanup, label %vector.ph