[llvm] 41d8149 - [ARM] Lower MVETRUNC to stack operations
David Green via llvm-commits
llvm-commits at lists.llvm.org
Sat Jun 26 14:13:12 PDT 2021
Author: David Green
Date: 2021-06-26T22:12:57+01:00
New Revision: 41d8149ee972b8498288b5051a6966cc9e89d57c
URL: https://github.com/llvm/llvm-project/commit/41d8149ee972b8498288b5051a6966cc9e89d57c
DIFF: https://github.com/llvm/llvm-project/commit/41d8149ee972b8498288b5051a6966cc9e89d57c.diff
LOG: [ARM] Lower MVETRUNC to stack operations
The MVETRUNC node truncates two wide vectors into a single vector of
narrower elements. This is usually lowered to a series of extract/insert
element operations that go via GPR registers. This patch changes the
lowering to instead use a pair of truncating stores and a stack reload,
cutting down the number of instructions at the expense of some stack
space.
Differential Revision: https://reviews.llvm.org/D104515
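
As a rough illustration of the new lowering (not part of the commit), the
plain C++ sketch below models what the generated code now does for an
<8 x i32> -> <8 x i16> truncation: each input vector is written to a 16-byte
stack slot with a truncating store (VSTRH.32) at offsets 0 and 8, and the
narrowed result is reloaded with a single VLDRW.32. The function and
variable names are illustrative only.

    #include <cstdint>
    #include <cstring>

    // Conceptual model of the MVETRUNC stack lowering for <8 x i32> -> <8 x i16>.
    // 'slot' stands in for the 16-byte, 4-byte-aligned stack temporary; each loop
    // models one truncating store of four lanes, and the memcpy models the reload.
    void trunc_v8i32_to_v8i16(const uint32_t in[8], uint16_t out[8]) {
      uint16_t slot[8];                        // 16-byte stack temporary
      for (int i = 0; i < 4; ++i)              // VSTRH.32 q0, [sp]
        slot[i] = static_cast<uint16_t>(in[i]);
      for (int i = 0; i < 4; ++i)              // VSTRH.32 q1, [sp, #8]
        slot[4 + i] = static_cast<uint16_t>(in[4 + i]);
      std::memcpy(out, slot, sizeof(slot));    // VLDRW.32 q0, [sp]
    }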
Added:
Modified:
llvm/lib/Target/ARM/ARMISelLowering.cpp
llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll
llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
llvm/test/CodeGen/Thumb2/mve-sext.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 3b385ba0dec1..cfdff80585c8 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -17217,7 +17217,7 @@ static SDValue PerformBITCASTCombine(SDNode *N,
}
// Some combines for the MVETrunc truncations legalizer helper. Also lowers the
-// node into a buildvector after legalizeOps.
+// node into stack operations after legalizeOps.
SDValue ARMTargetLowering::PerformMVETruncCombine(
SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -17265,7 +17265,14 @@ SDValue ARMTargetLowering::PerformMVETruncCombine(
}
}
- auto LowerToBuildVec = [&]() {
+ // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
+ // truncate to a buildvector to allow the generic optimisations to kick in.
+ if (all_of(N->ops(), [](SDValue Op) {
+ return Op.getOpcode() == ISD::BUILD_VECTOR ||
+ Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
+ (Op.getOpcode() == ISD::BITCAST &&
+ Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
+ })) {
SmallVector<SDValue, 8> Extracts;
for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {
SDValue O = N->getOperand(Op);
@@ -17276,26 +17283,40 @@ SDValue ARMTargetLowering::PerformMVETruncCombine(
}
}
return DAG.getBuildVector(VT, DL, Extracts);
- };
-
- // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
- // truncate to a buildvector to allow the generic optimisations to kick in.
- if (all_of(N->ops(), [](SDValue Op) {
- return Op.getOpcode() == ISD::BUILD_VECTOR ||
- Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
- (Op.getOpcode() == ISD::BITCAST &&
- Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
- }))
- return LowerToBuildVec();
+ }
// If we are late in the legalization process and nothing has optimised
- // the trunc to anything better lower it to a series of extracts and a
- // buildvector.
+ // the trunc to anything better, lower it to a stack store and reload,
+ // performing the truncation whilst keeping the lanes in the correct order:
+ // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
if (DCI.isBeforeLegalizeOps())
return SDValue();
- SDValue BuildVec = LowerToBuildVec();
- return LowerBUILD_VECTOR(BuildVec, DCI.DAG, Subtarget);
+ SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4));
+ int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ int NumIns = N->getNumOperands();
+ assert((NumIns == 2 || NumIns == 4) &&
+ "Expected 2 or 4 inputs to an MVETrunc");
+ EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
+ if (N->getNumOperands() == 4)
+ StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext());
+
+ SmallVector<SDValue> Chains;
+ for (int I = 0; I < NumIns; I++) {
+ SDValue Ptr = DAG.getNode(
+ ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
+ DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType()));
+ MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(), SPFI, I * 16 / NumIns);
+ SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I),
+ Ptr, MPI, StoreVT, Align(4));
+ Chains.push_back(Ch);
+ }
+
+ SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
+ return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));
}
SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
index 8e7a7eb5dad9..5542cf1f9cd6 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
@@ -275,105 +275,88 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @ext_add_ashr_trunc_i8i32(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: ext_add_ashr_trunc_i8i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vmov.u8 r0, q1[14]
-; CHECK-NEXT: vmov.u8 r1, q1[12]
-; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT: vmov.u8 r0, q1[15]
-; CHECK-NEXT: vmov.u8 r1, q1[13]
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: vmov.u8 r1, q1[14]
+; CHECK-NEXT: vmov.u8 r2, q1[12]
+; CHECK-NEXT: vmov q3[2], q3[0], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q1[15]
+; CHECK-NEXT: vmov.u8 r2, q1[13]
; CHECK-NEXT: vmov.i32 q2, #0xff
-; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT: vmov.u8 r0, q0[14]
-; CHECK-NEXT: vmov.u8 r1, q0[12]
+; CHECK-NEXT: vmov q3[3], q3[1], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q0[14]
+; CHECK-NEXT: vmov.u8 r2, q0[12]
; CHECK-NEXT: vand q3, q3, q2
-; CHECK-NEXT: vmov q4[2], q4[0], r1, r0
-; CHECK-NEXT: vmov.u8 r0, q0[15]
-; CHECK-NEXT: vmov.u8 r1, q0[13]
-; CHECK-NEXT: vmov.u8 r4, q1[6]
-; CHECK-NEXT: vmov q4[3], q4[1], r1, r0
-; CHECK-NEXT: vmov.u8 r0, q1[2]
+; CHECK-NEXT: vmov q4[2], q4[0], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q0[15]
+; CHECK-NEXT: vmov.u8 r2, q0[13]
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: vmov q4[3], q4[1], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q1[10]
; CHECK-NEXT: vmovlb.s8 q4, q4
-; CHECK-NEXT: vmov.u8 r1, q1[0]
+; CHECK-NEXT: vmov.u8 r2, q1[8]
; CHECK-NEXT: vmovlb.s16 q4, q4
-; CHECK-NEXT: vmov.u8 r5, q1[4]
; CHECK-NEXT: vadd.i32 q3, q4, q3
; CHECK-NEXT: vshr.u32 q3, q3, #1
-; CHECK-NEXT: vmov lr, r12, d7
-; CHECK-NEXT: vmov r3, r2, d6
-; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT: vmov.u8 r0, q1[3]
-; CHECK-NEXT: vmov.u8 r1, q1[1]
-; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT: vmov.u8 r0, q0[2]
-; CHECK-NEXT: vmov.u8 r1, q0[0]
+; CHECK-NEXT: vstrb.32 q3, [r0, #12]
+; CHECK-NEXT: vmov q3[2], q3[0], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q1[11]
+; CHECK-NEXT: vmov.u8 r2, q1[9]
+; CHECK-NEXT: vmov q3[3], q3[1], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q0[10]
+; CHECK-NEXT: vmov.u8 r2, q0[8]
; CHECK-NEXT: vand q3, q3, q2
-; CHECK-NEXT: vmov q4[2], q4[0], r1, r0
-; CHECK-NEXT: vmov.u8 r0, q0[3]
-; CHECK-NEXT: vmov.u8 r1, q0[1]
-; CHECK-NEXT: vmov q4[3], q4[1], r1, r0
+; CHECK-NEXT: vmov q4[2], q4[0], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q0[11]
+; CHECK-NEXT: vmov.u8 r2, q0[9]
+; CHECK-NEXT: vmov q4[3], q4[1], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q1[6]
; CHECK-NEXT: vmovlb.s8 q4, q4
+; CHECK-NEXT: vmov.u8 r2, q1[4]
; CHECK-NEXT: vmovlb.s16 q4, q4
; CHECK-NEXT: vadd.i32 q3, q4, q3
-; CHECK-NEXT: vmov q4[2], q4[0], r5, r4
-; CHECK-NEXT: vmov.u8 r4, q1[7]
-; CHECK-NEXT: vmov.u8 r5, q1[5]
-; CHECK-NEXT: vmov q4[3], q4[1], r5, r4
-; CHECK-NEXT: vmov.u8 r4, q0[6]
-; CHECK-NEXT: vmov.u8 r5, q0[4]
; CHECK-NEXT: vshr.u32 q3, q3, #1
-; CHECK-NEXT: vmov q5[2], q5[0], r5, r4
-; CHECK-NEXT: vmov.u8 r4, q0[7]
-; CHECK-NEXT: vmov.u8 r5, q0[5]
-; CHECK-NEXT: vand q4, q4, q2
-; CHECK-NEXT: vmov q5[3], q5[1], r5, r4
-; CHECK-NEXT: vmov.u8 r4, q0[10]
-; CHECK-NEXT: vmovlb.s8 q5, q5
-; CHECK-NEXT: vmov.u8 r5, q0[8]
-; CHECK-NEXT: vmovlb.s16 q5, q5
-; CHECK-NEXT: vmov r1, r0, d6
-; CHECK-NEXT: vadd.i32 q4, q5, q4
-; CHECK-NEXT: vmov q5[2], q5[0], r5, r4
-; CHECK-NEXT: vmov.u8 r4, q0[11]
-; CHECK-NEXT: vmov.u8 r5, q0[9]
-; CHECK-NEXT: vmov q5[3], q5[1], r5, r4
-; CHECK-NEXT: vmov.8 q0[0], r1
-; CHECK-NEXT: vmov.u8 r4, q1[10]
-; CHECK-NEXT: vmov.u8 r5, q1[8]
-; CHECK-NEXT: vmov q6[2], q6[0], r5, r4
-; CHECK-NEXT: vmov.8 q0[1], r0
-; CHECK-NEXT: vmov r0, r1, d7
-; CHECK-NEXT: vmov.u8 r4, q1[11]
-; CHECK-NEXT: vmov.u8 r5, q1[9]
-; CHECK-NEXT: vmov.8 q0[2], r0
-; CHECK-NEXT: vmov q6[3], q6[1], r5, r4
-; CHECK-NEXT: vshr.u32 q4, q4, #1
-; CHECK-NEXT: vmov.8 q0[3], r1
-; CHECK-NEXT: vmov r0, r1, d8
-; CHECK-NEXT: vand q1, q6, q2
-; CHECK-NEXT: vmovlb.s8 q2, q5
-; CHECK-NEXT: vmov.8 q0[4], r0
-; CHECK-NEXT: vmovlb.s16 q2, q2
-; CHECK-NEXT: vadd.i32 q1, q2, q1
-; CHECK-NEXT: vmov r4, r5, d9
-; CHECK-NEXT: vmov.8 q0[5], r1
-; CHECK-NEXT: vshr.u32 q1, q1, #1
-; CHECK-NEXT: vmov.8 q0[6], r4
-; CHECK-NEXT: vmov r0, r1, d3
-; CHECK-NEXT: vmov.8 q0[7], r5
-; CHECK-NEXT: vmov r4, r5, d2
-; CHECK-NEXT: vmov.8 q0[8], r4
-; CHECK-NEXT: vmov.8 q0[9], r5
-; CHECK-NEXT: vmov.8 q0[10], r0
-; CHECK-NEXT: vmov.8 q0[11], r1
-; CHECK-NEXT: vmov.8 q0[12], r3
-; CHECK-NEXT: vmov.8 q0[13], r2
-; CHECK-NEXT: vmov.8 q0[14], lr
-; CHECK-NEXT: vmov.8 q0[15], r12
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: vstrb.32 q3, [r0, #8]
+; CHECK-NEXT: vmov q3[2], q3[0], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q1[7]
+; CHECK-NEXT: vmov.u8 r2, q1[5]
+; CHECK-NEXT: vmov q3[3], q3[1], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q0[6]
+; CHECK-NEXT: vmov.u8 r2, q0[4]
+; CHECK-NEXT: vand q3, q3, q2
+; CHECK-NEXT: vmov q4[2], q4[0], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q0[7]
+; CHECK-NEXT: vmov.u8 r2, q0[5]
+; CHECK-NEXT: vmov q4[3], q4[1], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q1[2]
+; CHECK-NEXT: vmovlb.s8 q4, q4
+; CHECK-NEXT: vmov.u8 r2, q1[0]
+; CHECK-NEXT: vmovlb.s16 q4, q4
+; CHECK-NEXT: vadd.i32 q3, q4, q3
+; CHECK-NEXT: vshr.u32 q3, q3, #1
+; CHECK-NEXT: vstrb.32 q3, [r0, #4]
+; CHECK-NEXT: vmov q3[2], q3[0], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q1[3]
+; CHECK-NEXT: vmov.u8 r2, q1[1]
+; CHECK-NEXT: vmov q3[3], q3[1], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q0[2]
+; CHECK-NEXT: vmov.u8 r2, q0[0]
+; CHECK-NEXT: vand q1, q3, q2
+; CHECK-NEXT: vmov q2[2], q2[0], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q0[3]
+; CHECK-NEXT: vmov.u8 r2, q0[1]
+; CHECK-NEXT: vmov q2[3], q2[1], r2, r1
+; CHECK-NEXT: vmovlb.s8 q0, q2
+; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vadd.i32 q0, q0, q1
+; CHECK-NEXT: vshr.u32 q0, q0, #1
+; CHECK-NEXT: vstrb.32 q0, [r0]
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: add sp, #16
+; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: bx lr
entry:
%sa = sext <16 x i8> %a to <16 x i32>
%sb = zext <16 x i8> %b to <16 x i32>
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll
index 248ba3e95de3..f549cebe304e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll
@@ -362,23 +362,16 @@ entry:
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <8 x i8>* %offptr, <8 x i32> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: vmov r4, r5, d0
-; CHECK-NEXT: vmov.16 q2[0], r4
-; CHECK-NEXT: vmov lr, r12, d3
-; CHECK-NEXT: vmov r3, r2, d2
-; CHECK-NEXT: vldrb.u16 q1, [r1]
-; CHECK-NEXT: vmov r1, r4, d1
-; CHECK-NEXT: vmov.16 q2[1], r5
-; CHECK-NEXT: vmov.16 q2[2], r1
-; CHECK-NEXT: vmov.16 q2[3], r4
-; CHECK-NEXT: vmov.16 q2[4], r3
-; CHECK-NEXT: vmov.16 q2[5], r2
-; CHECK-NEXT: vmov.16 q2[6], lr
-; CHECK-NEXT: vmov.16 q2[7], r12
-; CHECK-NEXT: vstrh.16 q2, [r0, q1]
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: mov r2, sp
+; CHECK-NEXT: vstrh.32 q1, [r2, #8]
+; CHECK-NEXT: vstrh.32 q0, [r2]
+; CHECK-NEXT: vldrb.u16 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vstrh.16 q1, [r0, q0]
+; CHECK-NEXT: add sp, #16
+; CHECK-NEXT: bx lr
entry:
%offs = load <8 x i8>, <8 x i8>* %offptr, align 1
%offs.zext = zext <8 x i8> %offs to <8 x i32>
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
index fe7dce78d4d8..99ac3e993e56 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
@@ -374,38 +374,18 @@ entry:
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <16 x i8>* %offptr, <16 x i32> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: vmov r4, r5, d0
-; CHECK-NEXT: vmov.8 q4[0], r4
-; CHECK-NEXT: vmov lr, r12, d7
-; CHECK-NEXT: vmov r3, r2, d6
-; CHECK-NEXT: vldrb.u8 q3, [r1]
-; CHECK-NEXT: vmov r1, r4, d1
-; CHECK-NEXT: vmov.8 q4[1], r5
-; CHECK-NEXT: vmov.8 q4[2], r1
-; CHECK-NEXT: vmov r1, r5, d2
-; CHECK-NEXT: vmov.8 q4[3], r4
-; CHECK-NEXT: vmov.8 q4[4], r1
-; CHECK-NEXT: vmov r1, r4, d3
-; CHECK-NEXT: vmov.8 q4[5], r5
-; CHECK-NEXT: vmov.8 q4[6], r1
-; CHECK-NEXT: vmov r1, r5, d4
-; CHECK-NEXT: vmov.8 q4[7], r4
-; CHECK-NEXT: vmov.8 q4[8], r1
-; CHECK-NEXT: vmov r1, r4, d5
-; CHECK-NEXT: vmov.8 q4[9], r5
-; CHECK-NEXT: vmov.8 q4[10], r1
-; CHECK-NEXT: vmov.8 q4[11], r4
-; CHECK-NEXT: vmov.8 q4[12], r3
-; CHECK-NEXT: vmov.8 q4[13], r2
-; CHECK-NEXT: vmov.8 q4[14], lr
-; CHECK-NEXT: vmov.8 q4[15], r12
-; CHECK-NEXT: vstrb.8 q4, [r0, q3]
-; CHECK-NEXT: vpop {d8, d9}
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: mov r2, sp
+; CHECK-NEXT: vstrb.32 q3, [r2, #12]
+; CHECK-NEXT: vstrb.32 q2, [r2, #8]
+; CHECK-NEXT: vstrb.32 q1, [r2, #4]
+; CHECK-NEXT: vstrb.32 q0, [r2]
+; CHECK-NEXT: vldrb.u8 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vstrb.8 q1, [r0, q0]
+; CHECK-NEXT: add sp, #16
+; CHECK-NEXT: bx lr
entry:
%offs = load <16 x i8>, <16 x i8>* %offptr, align 1
%offs.zext = zext <16 x i8> %offs to <16 x i32>
@@ -418,40 +398,15 @@ entry:
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <16 x i8>* %offptr, <16 x i16> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u16 r3, q0[0]
-; CHECK-NEXT: vmov.u16 r2, q1[7]
-; CHECK-NEXT: vmov.8 q2[0], r3
-; CHECK-NEXT: vmov.u16 r3, q0[1]
-; CHECK-NEXT: vmov.8 q2[1], r3
-; CHECK-NEXT: vmov.u16 r3, q0[2]
-; CHECK-NEXT: vmov.8 q2[2], r3
-; CHECK-NEXT: vmov.u16 r3, q0[3]
-; CHECK-NEXT: vmov.8 q2[3], r3
-; CHECK-NEXT: vmov.u16 r3, q0[4]
-; CHECK-NEXT: vmov.8 q2[4], r3
-; CHECK-NEXT: vmov.u16 r3, q0[5]
-; CHECK-NEXT: vmov.8 q2[5], r3
-; CHECK-NEXT: vmov.u16 r3, q0[6]
-; CHECK-NEXT: vmov.8 q2[6], r3
-; CHECK-NEXT: vmov.u16 r3, q0[7]
-; CHECK-NEXT: vmov.8 q2[7], r3
-; CHECK-NEXT: vmov.u16 r3, q1[0]
-; CHECK-NEXT: vmov.8 q2[8], r3
-; CHECK-NEXT: vmov.u16 r3, q1[1]
-; CHECK-NEXT: vmov.8 q2[9], r3
-; CHECK-NEXT: vmov.u16 r3, q1[2]
-; CHECK-NEXT: vmov.8 q2[10], r3
-; CHECK-NEXT: vmov.u16 r3, q1[3]
-; CHECK-NEXT: vmov.8 q2[11], r3
-; CHECK-NEXT: vmov.u16 r3, q1[4]
-; CHECK-NEXT: vmov.8 q2[12], r3
-; CHECK-NEXT: vmov.u16 r3, q1[5]
-; CHECK-NEXT: vmov.8 q2[13], r3
-; CHECK-NEXT: vmov.u16 r3, q1[6]
-; CHECK-NEXT: vmov.8 q2[14], r3
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: mov r2, sp
+; CHECK-NEXT: vstrb.16 q1, [r2, #8]
+; CHECK-NEXT: vstrb.16 q0, [r2]
; CHECK-NEXT: vldrb.u8 q0, [r1]
-; CHECK-NEXT: vmov.8 q2[15], r2
-; CHECK-NEXT: vstrb.8 q2, [r0, q0]
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vstrb.8 q1, [r0, q0]
+; CHECK-NEXT: add sp, #16
; CHECK-NEXT: bx lr
entry:
%offs = load <16 x i8>, <16 x i8>* %offptr, align 1
diff --git a/llvm/test/CodeGen/Thumb2/mve-sext.ll b/llvm/test/CodeGen/Thumb2/mve-sext.ll
index 59e66f7abe56..da804a528551 100644
--- a/llvm/test/CodeGen/Thumb2/mve-sext.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-sext.ll
@@ -434,39 +434,13 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @trunc_v16i16_v16i8(<16 x i16> %src) {
; CHECK-LABEL: trunc_v16i16_v16i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov q2, q0
-; CHECK-NEXT: vmov.u16 r0, q0[0]
-; CHECK-NEXT: vmov.8 q0[0], r0
-; CHECK-NEXT: vmov.u16 r0, q2[1]
-; CHECK-NEXT: vmov.8 q0[1], r0
-; CHECK-NEXT: vmov.u16 r0, q2[2]
-; CHECK-NEXT: vmov.8 q0[2], r0
-; CHECK-NEXT: vmov.u16 r0, q2[3]
-; CHECK-NEXT: vmov.8 q0[3], r0
-; CHECK-NEXT: vmov.u16 r0, q2[4]
-; CHECK-NEXT: vmov.8 q0[4], r0
-; CHECK-NEXT: vmov.u16 r0, q2[5]
-; CHECK-NEXT: vmov.8 q0[5], r0
-; CHECK-NEXT: vmov.u16 r0, q2[6]
-; CHECK-NEXT: vmov.8 q0[6], r0
-; CHECK-NEXT: vmov.u16 r0, q2[7]
-; CHECK-NEXT: vmov.8 q0[7], r0
-; CHECK-NEXT: vmov.u16 r0, q1[0]
-; CHECK-NEXT: vmov.8 q0[8], r0
-; CHECK-NEXT: vmov.u16 r0, q1[1]
-; CHECK-NEXT: vmov.8 q0[9], r0
-; CHECK-NEXT: vmov.u16 r0, q1[2]
-; CHECK-NEXT: vmov.8 q0[10], r0
-; CHECK-NEXT: vmov.u16 r0, q1[3]
-; CHECK-NEXT: vmov.8 q0[11], r0
-; CHECK-NEXT: vmov.u16 r0, q1[4]
-; CHECK-NEXT: vmov.8 q0[12], r0
-; CHECK-NEXT: vmov.u16 r0, q1[5]
-; CHECK-NEXT: vmov.8 q0[13], r0
-; CHECK-NEXT: vmov.u16 r0, q1[6]
-; CHECK-NEXT: vmov.8 q0[14], r0
-; CHECK-NEXT: vmov.u16 r0, q1[7]
-; CHECK-NEXT: vmov.8 q0[15], r0
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: vstrb.16 q1, [r0, #8]
+; CHECK-NEXT: vstrb.16 q0, [r0]
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: add sp, #16
; CHECK-NEXT: bx lr
entry:
%0 = trunc <16 x i16> %src to <16 x i8>
@@ -476,19 +450,13 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @trunc_v8i32_v8i16(<8 x i32> %src) {
; CHECK-LABEL: trunc_v8i32_v8i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov q2, q0
-; CHECK-NEXT: vmov r0, r1, d4
-; CHECK-NEXT: vmov.16 q0[0], r0
-; CHECK-NEXT: vmov.16 q0[1], r1
-; CHECK-NEXT: vmov r0, r1, d5
-; CHECK-NEXT: vmov.16 q0[2], r0
-; CHECK-NEXT: vmov.16 q0[3], r1
-; CHECK-NEXT: vmov r0, r1, d2
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov.16 q0[5], r1
-; CHECK-NEXT: vmov r0, r1, d3
-; CHECK-NEXT: vmov.16 q0[6], r0
-; CHECK-NEXT: vmov.16 q0[7], r1
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: vstrh.32 q1, [r0, #8]
+; CHECK-NEXT: vstrh.32 q0, [r0]
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: add sp, #16
; CHECK-NEXT: bx lr
entry:
%0 = trunc <8 x i32> %src to <8 x i16>
@@ -498,34 +466,15 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @trunc_v16i32_v16i8(<16 x i32> %src) {
; CHECK-LABEL: trunc_v16i32_v16i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: vmov q4, q0
-; CHECK-NEXT: vmov r0, r1, d8
-; CHECK-NEXT: vmov.8 q0[0], r0
-; CHECK-NEXT: vmov.8 q0[1], r1
-; CHECK-NEXT: vmov r0, r1, d9
-; CHECK-NEXT: vmov.8 q0[2], r0
-; CHECK-NEXT: vmov.8 q0[3], r1
-; CHECK-NEXT: vmov r0, r1, d2
-; CHECK-NEXT: vmov.8 q0[4], r0
-; CHECK-NEXT: vmov.8 q0[5], r1
-; CHECK-NEXT: vmov r0, r1, d3
-; CHECK-NEXT: vmov.8 q0[6], r0
-; CHECK-NEXT: vmov.8 q0[7], r1
-; CHECK-NEXT: vmov r0, r1, d4
-; CHECK-NEXT: vmov.8 q0[8], r0
-; CHECK-NEXT: vmov.8 q0[9], r1
-; CHECK-NEXT: vmov r0, r1, d5
-; CHECK-NEXT: vmov.8 q0[10], r0
-; CHECK-NEXT: vmov.8 q0[11], r1
-; CHECK-NEXT: vmov r0, r1, d6
-; CHECK-NEXT: vmov.8 q0[12], r0
-; CHECK-NEXT: vmov.8 q0[13], r1
-; CHECK-NEXT: vmov r0, r1, d7
-; CHECK-NEXT: vmov.8 q0[14], r0
-; CHECK-NEXT: vmov.8 q0[15], r1
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: vstrb.32 q3, [r0, #12]
+; CHECK-NEXT: vstrb.32 q2, [r0, #8]
+; CHECK-NEXT: vstrb.32 q1, [r0, #4]
+; CHECK-NEXT: vstrb.32 q0, [r0]
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: add sp, #16
; CHECK-NEXT: bx lr
entry:
%0 = trunc <16 x i32> %src to <16 x i8>